; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_add_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    addps %xmm1, %xmm0 # encoding: [0x0f,0x58,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_add_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_add_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fadd <4 x float> %a0, %a1
  ret <4 x float> %res
}

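; Scalar (_ss) intrinsics operate on element 0 only and pass the upper
; elements of the first source through unchanged, which is why the IR below
; extracts lane 0, operates on it, and reinserts the result into %a0.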
define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x58,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_add_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x58,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_add_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fadd = fadd float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fadd, i32 0
  ret <4 x float> %res
}

define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_and_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    andps %xmm1, %xmm0 # encoding: [0x0f,0x54,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_and_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_and_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

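; Note: fast-isel selects the xor and the and independently, so on AVX targets
; the NOT is materialized explicitly (vpcmpeqd all-ones + vpxor, or
; vpternlogq $15 on AVX512) rather than being folded into vandnps.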
define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_andnot_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_andnot_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_andnot_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

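; Packed compares return all-ones/all-zeros lanes, modelled in IR as a
; sign-extended <4 x i1>. With AVX512VL the compare writes a mask register,
; which vpmovm2d (from AVX512DQ) expands back into a vector.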
define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpeq_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpeqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x00]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpeq_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpeq_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x00]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpeq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpeqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x00]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpeq_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x00]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

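; The trailing i8 immediate of the cmpss/cmpps family selects the predicate:
; 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord. SSE has no ge/gt
; encoding, so the *ge/*gt tests below commute the operands and compare with
; le/lt instead.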
define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpge_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x02]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpge_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x02]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpge_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x02]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ole <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpge_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x02]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpge_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpgt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x01]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpgt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x01]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpgt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x01]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp olt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpgt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x01]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpgt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmple_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x02]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmple_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x02]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmple_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ole <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmple_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x02]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmple_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x02]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmplt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x01]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmplt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x01]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmplt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp olt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmplt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x01]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmplt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x01]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpneq_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpneqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x04]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpneq_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpneqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x04]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpneq_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpneqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x04]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp une <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpneq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpneqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x04]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpneq_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpneqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x04]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnge_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x06]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpnge_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x06]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpnge_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x06]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ugt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnge_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x06]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpnge_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpngt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x05]
; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpngt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x05]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpngt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x05]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp uge <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpngt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x05]
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpngt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnle_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x06]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpnle_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x06]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpnle_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x06]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ugt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnle_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x06]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpnle_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x06]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnlt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x05]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpnlt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpnltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x05]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpnlt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpnltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x05]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp uge <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpnlt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpnltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x05]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpnlt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpnltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x05]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpord_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x07]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpord_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpord_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x07]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp ord <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpord_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x07]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpord_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpunord_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpunordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x03]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cmpunord_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpunordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x03]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cmpunord_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpunordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x03]
; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %cmp = fcmp uno <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_cmpunord_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    cmpunordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x03]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_cmpunord_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x03]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
  ret <4 x float> %res
}

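; comiss sets ZF/PF/CF from the scalar compare, and an unordered result sets
; PF. comieq therefore combines sete with setnp (and comineq setne with setp)
; so that NaN operands produce the correct answer.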
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comieq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comieq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comieq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comige_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comige_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comige_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comigt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comigt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comigt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comile_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comile_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comile_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comilt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comilt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comilt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_comineq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_comineq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_comineq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

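; cvtss2si converts using the current MXCSR rounding mode; the cvttss2si
; tests further down truncate toward zero instead.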
define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvt_ss2si:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvt_ss2si:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvt_ss2si:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X86-SSE-LABEL: test_mm_cvtsi32_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_cvtsi32_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    cvtsi2ssl %edi, %xmm0 # encoding: [0xf3,0x0f,0x2a,0xc7]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_cvtsi32_ss:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0xc7]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_cvtsi32_ss:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone

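; Extracting lane 0 is free on x86-64 since the argument and the return value
; both live in %xmm0; the i386 calling convention returns float in st(0),
; hence the store/flds round trip through the stack.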
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X86-SSE-LABEL: test_mm_cvtss_f32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_cvtss_f32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_cvtss_f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_cvtss_f32:
; X64:       # %bb.0:
; X64-NEXT:    retq # encoding: [0xc3]
  %res = extractelement <4 x float> %a0, i32 0
  ret float %res
}

define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvtss_si32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvtss_si32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvtss_si32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}

define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvttss_si:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvttss_si:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvttss_si:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone

define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_cvttss_si32:
; SSE:       # %bb.0:
; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_cvttss_si32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_cvttss_si32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
  ret i32 %res
}

define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_div_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    divps %xmm1, %xmm0 # encoding: [0x0f,0x5e,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_div_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5e,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_div_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fdiv <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5e,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_div_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5e,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_div_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fdiv = fdiv float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
  ret <4 x float> %res
}

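; The _MM_GET_* helpers read MXCSR via stmxcsr and mask out the relevant
; field: 0x3F = exception flags, 0x1F80 = exception masks, 0x6000 = rounding
; control, 0x8000 = flush-to-zero.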
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X86-SSE-NEXT:    # imm = 0x1F80
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X86-AVX-NEXT:    # imm = 0x1F80
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X64-SSE-NEXT:    # imm = 0x1F80
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
; X64-AVX-NEXT:    # imm = 0x1F80
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 8064
  ret i32 %4
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 63
  ret i32 %4
}

define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X86-SSE-NEXT:    # imm = 0x8000
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X86-AVX-NEXT:    # imm = 0x8000
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X64-SSE-NEXT:    # imm = 0x8000
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
; X64-AVX-NEXT:    # imm = 0x8000
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 32768
  ret i32 %4
}

define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X86-SSE-NEXT:    # imm = 0x6000
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X86-AVX-NEXT:    # imm = 0x6000
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X64-SSE-NEXT:    # imm = 0x6000
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
; X64-AVX-NEXT:    # imm = 0x6000
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 24576
  ret i32 %4
}

define i32 @test_mm_getcsr() nounwind {
; X86-SSE-LABEL: test_mm_getcsr:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_mm_getcsr:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_getcsr:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_mm_getcsr:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  ret i32 %3
}

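; _mm_load_ps requires 16-byte alignment (align 16 on the IR load), which is
; what permits the aligned movaps form.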
define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 16
  ret <4 x float> %res
}

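; The splat loads differ by ISA: SSE loads the scalar and shuffles it into
; every lane, while AVX can use a single vbroadcastss from memory.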
1177define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
1178; X86-SSE-LABEL: test_mm_load_ps1:
1179; X86-SSE:       # %bb.0:
1180; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1181; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1182; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1183; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1184; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1185; X86-SSE-NEXT:    retl # encoding: [0xc3]
1186;
1187; X86-AVX1-LABEL: test_mm_load_ps1:
1188; X86-AVX1:       # %bb.0:
1189; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1190; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
1191; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1192;
1193; X86-AVX512-LABEL: test_mm_load_ps1:
1194; X86-AVX512:       # %bb.0:
1195; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1196; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
1197; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1198;
1199; X64-SSE-LABEL: test_mm_load_ps1:
1200; X64-SSE:       # %bb.0:
1201; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1202; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1203; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1204; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1205; X64-SSE-NEXT:    retq # encoding: [0xc3]
1206;
1207; X64-AVX1-LABEL: test_mm_load_ps1:
1208; X64-AVX1:       # %bb.0:
1209; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
1210; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1211;
1212; X64-AVX512-LABEL: test_mm_load_ps1:
1213; X64-AVX512:       # %bb.0:
1214; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
1215; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1216  %ld = load float, float* %a0, align 4
1217  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1218  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
1219  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
1220  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
1221  ret <4 x float> %res3
1222}
1223
define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load_ss:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
; X64-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load_ss:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; X64-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ld = load float, float* %a0, align 1
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

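; Note: _mm_load1_ps is the same splat-load pattern as _mm_load_ps1 above and
; should produce identical code in every configuration.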
define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_load1_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_load1_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_load1_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_load1_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_load1_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_load1_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

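; Note: _mm_loadh_pi/_mm_loadl_pi read 64 bits and place them in the high or
; low half of the vector. With AVX this is a single vmovhpd/vmovlpd load; the
; SSE-only x86-64 configuration (no SSE2) instead round-trips the 8 bytes
; through a GPR and the stack, then recombines them with scalar loads and
; shuffles, as the checks below show.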
define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X86-SSE-LABEL: test_mm_loadh_pi:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00]
; X86-SSE-NEXT:    # xmm2 = xmm2[0,0],xmm1[0,0]
; X86-SSE-NEXT:    shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],xmm2[2,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadh_pi:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadh_pi:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovhpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x00]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadh_pi:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadh_pi:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x07]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadh_pi:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x07]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X86-SSE-LABEL: test_mm_loadl_pi:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10]
; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00]
; X86-SSE-NEXT:    # xmm1 = xmm1[0,0],xmm2[0,0]
; X86-SSE-NEXT:    shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2]
; X86-SSE-NEXT:    # xmm1 = xmm1[2,0],xmm0[2,3]
; X86-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadl_pi:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x00]
; X86-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadl_pi:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovlpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x00]
; X86-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadl_pi:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
; X64-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadl_pi:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x07]
; X64-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadl_pi:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x07]
; X64-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %res
}

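; Note: _mm_loadr_ps is an aligned load with the elements reversed. SSE needs
; movaps+shufps, while AVX folds the load into vpermilps with immediate 27
; (0x1B), i.e. the 3,2,1,0 lane permutation.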
define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_loadr_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadr_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $27, (%eax), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
; X86-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadr_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vpermilps $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
; X86-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadr_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadr_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $27, (%rdi), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
; X64-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadr_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpermilps $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
; X64-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %ld = load <4 x float>, <4 x float>* %arg0, align 16
  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

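; Note: _mm_loadu_ps is the unaligned counterpart of _mm_load_ps; the
; 'align 1' on the IR load is what selects (v)movups instead of (v)movaps.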
define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
; X86-SSE-LABEL: test_mm_loadu_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movups (%eax), %xmm0 # encoding: [0x0f,0x10,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadu_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_loadu_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_loadu_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups (%rdi), %xmm0 # encoding: [0x0f,0x10,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadu_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_loadu_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 1
  ret <4 x float> %res
}

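; Note: the min/max tests call the llvm.x86.sse.{min,max}.{ps,ss} intrinsics
; rather than using IR fcmp+select, since the x86 instructions have
; non-commutative NaN and signed-zero behavior that generic IR does not
; express.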
define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_max_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    maxps %xmm1, %xmm0 # encoding: [0x0f,0x5f,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_max_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_max_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_max_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    maxss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5f,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_max_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5f,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_max_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_min_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    minps %xmm1, %xmm0 # encoding: [0x0f,0x5d,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_min_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_min_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_min_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    minss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5d,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_min_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5d,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_min_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

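; Note: the move_ss/movehl_ps/movelh_ps tests below are expressed as plain
; shufflevectors; instruction selection recovers movss (or vblendps on AVX),
; movhlps/vunpckhpd and movlhps from the shuffle masks.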
define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_move_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_move_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_movehl_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
; SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_movehl_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x15,0xc0]
; AVX1-NEXT:    # xmm0 = xmm1[1],xmm0[1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_movehl_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xc0]
; AVX512-NEXT:    # xmm0 = xmm1[1],xmm0[1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_movelh_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_movelh_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_movelh_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

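; Note: _mm_movemask_ps packs the sign bit of each lane into the low four bits
; of a GPR; the test calls the llvm.x86.sse.movmsk.ps intrinsic directly since
; plain IR has no single equivalent.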
define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
; SSE-LABEL: test_mm_movemask_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movmskps %xmm0, %eax # encoding: [0x0f,0x50,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_movemask_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovmskps %xmm0, %eax # encoding: [0xc5,0xf8,0x50,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_mul_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0 # encoding: [0x0f,0x59,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_mul_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x59,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_mul_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x59,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_mul_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x59,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_mul_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fmul = fmul float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fmul, i32 0
  ret <4 x float> %res
}

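; Note: there are no floating-point logic ops in IR, so _mm_or_ps is modeled
; by bitcasting to <4 x i32>, using integer 'or', and bitcasting back;
; selection still emits the FP-domain orps/vorps.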
define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_or_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    orps %xmm1, %xmm0 # encoding: [0x0f,0x56,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_or_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_or_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = or <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

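; Note: _mm_prefetch lowers to llvm.prefetch(ptr, rw, locality, cachetype).
; Here rw=0 (read), locality=0 and cachetype=1 (data) select the non-temporal
; hint, hence prefetchnta on both targets.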
define void @test_mm_prefetch(i8* %a0) {
; X86-LABEL: test_mm_prefetch:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT:    prefetchnta (%eax) # encoding: [0x0f,0x18,0x00]
; X86-NEXT:    retl # encoding: [0xc3]
;
; X64-LABEL: test_mm_prefetch:
; X64:       # %bb.0:
; X64-NEXT:    prefetchnta (%rdi) # encoding: [0x0f,0x18,0x07]
; X64-NEXT:    retq # encoding: [0xc3]
  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
  ret void
}
declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone

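; Note: rcpps/rsqrtps (and their scalar forms) are hardware approximations
; with roughly 12 bits of precision, so they stay as target intrinsics rather
; than being modeled with fdiv or llvm.sqrt.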
define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_rcp_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm0 # encoding: [0x0f,0x53,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rcp_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x53,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
; SSE-LABEL: test_mm_rcp_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x53,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rcp_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x53,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
  ret <4 x float> %rcp
}
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_rsqrt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm0 # encoding: [0x0f,0x52,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rsqrt_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x52,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
; SSE-LABEL: test_mm_rsqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x52,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX-LABEL: test_mm_rsqrt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x52,0xc0]
; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
  ret <4 x float> %rsqrt
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone

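; Note: the _MM_SET_* helpers below (and _MM_SET_ROUNDING_MODE further down)
; are read-modify-write sequences on MXCSR: stmxcsr spills the register to a
; stack slot, the masked field is updated in a GPR, and ldmxcsr reloads it.
; The and-masks pick out the MXCSR fields:
;   -64    = ~0x003F  exception status flags (bits 0-5)
;   -8065  = ~0x1F80  exception masks        (bits 7-12)
;   -24577 = ~0x6000  rounding control       (bits 13-14)
;   -32769 = ~0x8000  flush-to-zero          (bit 15)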
define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
; X86-SSE-NEXT:    # imm = 0xE07F
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
; X86-AVX-NEXT:    # imm = 0xE07F
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
; X64-SSE-NEXT:    # imm = 0xE07F
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
; X64-AVX-NEXT:    # imm = 0xE07F
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -8065
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone

define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -64
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
; X86-SSE-NEXT:    # imm = 0xFFFF7FFF
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
; X86-AVX-NEXT:    # imm = 0xFFFF7FFF
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
; X64-SSE-NEXT:    # imm = 0xFFFF7FFF
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
; X64-AVX-NEXT:    # imm = 0xFFFF7FFF
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -32769
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

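; Note: _mm_set_ps takes its arguments in high-to-low order, so the IR inserts
; %a3 into lane 0 and %a0 into lane 3. On x86-64 the four floats already
; arrive in xmm0-xmm3 and are merged with unpcklps/movlhps (or vinsertps on
; AVX); on i386 they must first be loaded from the stack.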
define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X86-SSE-LABEL: test_mm_set_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-SSE-NEXT:    unpcklps %xmm2, %xmm3 # encoding: [0x0f,0x14,0xda]
; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-SSE-NEXT:    movlhps %xmm1, %xmm3 # encoding: [0x0f,0x16,0xd9]
; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm1[0]
; X64-SSE-NEXT:    movaps %xmm3, %xmm0 # encoding: [0x0f,0x28,0xc3]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X64-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X64-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X64-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X64-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X64-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X64-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a3, i32 0
  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

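; Note: _mm_set_ps1 (like _mm_set1_ps further down) splats a scalar that is
; already in a register: shufps $0 on SSE, vpermilps $0 on AVX1, and
; vbroadcastss on AVX512, with the extra stack load appearing only on i386.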
define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; X86-SSE-LABEL: test_mm_set_ps1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ps1:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ps1:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set_ps1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set_ps1:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set_ps1:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-SSE-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
; X86-SSE-NEXT:    # imm = 0x9FFF
; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
; X86-AVX-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
; X86-AVX-NEXT:    # imm = 0x9FFF
; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
; X64-SSE-NEXT:    # imm = 0x9FFF
; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
; X64-AVX-NEXT:    # imm = 0x9FFF
; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -24577
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}

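; Note: _mm_set_ss places the scalar in lane 0 and zeros the rest, so codegen
; needs an explicit zero vector: xorps+movss on SSE, vxorps+vblendps on AVX.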
define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X86-SSE-LABEL: test_mm_set_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
; X86-SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    xorps %xmm1, %xmm1 # encoding: [0x0f,0x57,0xc9]
; X64-SSE-NEXT:    movss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0x10,0xc8]
; X64-SSE-NEXT:    # xmm1 = xmm0[0],xmm1[1,2,3]
; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_mm_set_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; X86-SSE-LABEL: test_mm_set1_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_set1_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_set1_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_set1_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_set1_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_set1_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

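; Note: _mm_setcsr needs a memory operand for ldmxcsr, so the IR stores the
; argument to a stack slot; per the checks, i386 reuses the incoming stack
; argument in place, while x86-64 first spills %edi.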
define void @test_mm_setcsr(i32 %a0) nounwind {
; X86-SSE-LABEL: test_mm_setcsr:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
; X86-SSE-NEXT:    ldmxcsr (%eax) # encoding: [0x0f,0xae,0x10]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX-LABEL: test_mm_setcsr:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
; X86-AVX-NEXT:    vldmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x10]
; X86-AVX-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_setcsr:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX-LABEL: test_mm_setcsr:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
; X64-AVX-NEXT:    retq # encoding: [0xc3]
  %st = alloca i32, align 4
  store i32 %a0, i32* %st, align 4
  %bc = bitcast i32* %st to i8*
  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
  ret void
}

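; Note: _mm_setr_ps is _mm_set_ps with the argument order reversed (%a0 lands
; in lane 0), so the same insert patterns appear with the registers swapped.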
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X86-SSE-LABEL: test_mm_setr_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_setr_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
; X86-AVX1-NEXT:    # xmm3 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X86-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X86-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_setr_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
; X86-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
; X86-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_setr_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3]
; X64-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-SSE-NEXT:    movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_setr_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX1-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX1-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_setr_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
  %res3  = insertelement <4 x float> %res2, float %a3, i32 3
  ret <4 x float> %res3
}

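; _mm_setzero_ps: zeroinitializer lowers to the self-XOR idiom, which CPUs
; recognize as dependency-free zeroing of the destination register.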
define <4 x float> @test_mm_setzero_ps() {
; SSE-LABEL: test_mm_setzero_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_setzero_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_setzero_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  ret <4 x float> zeroinitializer
}

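; llvm.x86.sse.sfence lowers to a single sfence on every configuration, so one
; CHECK prefix covers all six RUN lines.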
define void @test_mm_sfence() nounwind {
; CHECK-LABEL: test_mm_sfence:
; CHECK:       # %bb.0:
; CHECK-NEXT:    sfence # encoding: [0x0f,0xae,0xf8]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  call void @llvm.x86.sse.sfence()
  ret void
}
declare void @llvm.x86.sse.sfence() nounwind readnone

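; shufflevector mask <0,0,4,4> maps directly to (v)shufps: immediate 0 selects
; lane 0 of %a0 twice and lane 0 of %a1 twice.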
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_shuffle_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps $0, %xmm1, %xmm0 # encoding: [0x0f,0xc6,0xc1,0x00]
; SSE-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_shuffle_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
; AVX1-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_shuffle_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
; AVX512-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
  ret <4 x float> %res
}

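; llvm.sqrt.v4f32 selects the packed (v)sqrtps instruction.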
define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_sqrt_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0 # encoding: [0x0f,0x51,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sqrt_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x51,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sqrt_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone

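; Scalar sqrt of element 0, reinserted into %a0, folds into a single (v)sqrtss.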
define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
; SSE-LABEL: test_mm_sqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sqrt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sqrt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext = extractelement <4 x float> %a0, i32 0
  %sqrt = call float @llvm.sqrt.f32(float %ext)
  %ins = insertelement <4 x float> %a0, float %sqrt, i32 0
  ret <4 x float> %ins
}
declare float @llvm.sqrt.f32(float) nounwind readnone

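; Plain float sqrt: the i386 ABI returns float in st(0), hence the
; movss/flds round-trip through a stack slot that X64 avoids entirely.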
define float @test_mm_sqrt_ss_scalar(float %a0) {
; X86-SSE-LABEL: test_mm_sqrt_ss_scalar:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_sqrt_ss_scalar:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_sqrt_ss_scalar:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
; X86-AVX512-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
; X86-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
; X86-AVX512-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_sqrt_ss_scalar:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_sqrt_ss_scalar:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_sqrt_ss_scalar:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %sqrt = call float @llvm.sqrt.f32(float %a0)
  ret float %sqrt
}

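; 16-byte-aligned store selects (v)movaps.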
define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16
  ret void
}

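; _mm_store_ps1: splat of lane 0 followed by an aligned store; AVX512 does the
; splat with vbroadcastss instead of a shuffle.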
define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store_ps1:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ps1:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store_ps1:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store_ps1:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ps1:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store_ps1:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

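; Scalar store of lane 0 via (v)movss; align 1 is sufficient for the 4-byte
; access, so no extra alignment handling is needed.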
define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store_ss:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovss %xmm0, (%eax) # encoding: [0xc5,0xfa,0x11,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store_ss:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovss %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movss %xmm0, (%rdi) # encoding: [0xf3,0x0f,0x11,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store_ss:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store_ss:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ext = extractelement <4 x float> %a1, i32 0
  store float %ext, float* %a0, align 1
  ret void
}

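; _mm_store1_ps uses the same IR and produces the same splat-and-store codegen
; as _mm_store_ps1 above.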
define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_store1_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_store1_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_store1_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_store1_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_store1_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_store1_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

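; Store of the upper 64 bits: X86-SSE spills the vector to an aligned slot and
; copies with two 32-bit integer moves, while AVX uses vmovhpd (X86) or vpextrq
; (X64) to extract the high half directly.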
define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE-LABEL: test_mm_storeh_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeh_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovhpd %xmm0, (%eax) # encoding: [0xc5,0xf9,0x17,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storeh_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovhpd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x17,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storeh_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeh_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpextrq $1, %xmm0, %rax # encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storeh_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpextrq $1, %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 1
  store i64 %ext, i64* %ptr
  ret void
}

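; Same pattern for the lower 64 bits; AVX uses vmovlps (X86) or vmovq (X64)
; instead of the high-half extracts.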
define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE-LABEL: test_mm_storel_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
; X86-SSE-NEXT:    movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storel_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storel_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storel_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storel_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storel_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 0
  store i64 %ext, i64* %ptr
  ret void
}

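; Reversed store: immediate 27 (0b00011011) selects lanes 3,2,1,0 before the
; aligned store.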
define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_storer_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storer_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X86-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storer_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X86-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storer_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storer_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X64-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storer_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
; X64-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

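; Unaligned (align 1) store selects (v)movups.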
define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_storeu_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeu_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_storeu_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_storeu_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movups %xmm0, (%rdi) # encoding: [0x0f,0x11,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeu_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_storeu_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 1
  ret void
}

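; The !nontemporal aligned store selects the streaming (v)movntps instruction.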
define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X86-SSE-LABEL: test_mm_stream_ps:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT:    movntps %xmm0, (%eax) # encoding: [0x0f,0x2b,0x00]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_stream_ps:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX1-NEXT:    vmovntps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x2b,0x00]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_mm_stream_ps:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT:    vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_mm_stream_ps:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movntps %xmm0, (%rdi) # encoding: [0x0f,0x2b,0x07]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_stream_ps:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovntps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x2b,0x07]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_mm_stream_ps:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
  ret void
}

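; fsub on the full vector maps to (v)subps.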
define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_sub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    subps %xmm1, %xmm0 # encoding: [0x0f,0x5c,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sub_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5c,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sub_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = fsub <4 x float> %a0, %a1
  ret <4 x float> %res
}

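; Scalar subtract on lane 0, reinserted into %a0, folds into (v)subss.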
define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5c,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_sub_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5c,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_sub_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fsub = fsub float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fsub, i32 0
  ret <4 x float> %res
}

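; _MM_TRANSPOSE4_PS: a 4x4 transpose as an unpcklps/unpckhps stage followed by
; 64-bit half merges (movlhps/movhlps on SSE; vmovlhps/vunpckhpd on AVX).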
define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pushl %esi # encoding: [0x56]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-SSE-NEXT:    movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06]
; X86-SSE-NEXT:    movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a]
; X86-SSE-NEXT:    movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11]
; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
; X86-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
; X86-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X86-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X86-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
; X86-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
; X86-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
; X86-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
; X86-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
; X86-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
; X86-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
; X86-SSE-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
; X86-SSE-NEXT:    movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10]
; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
; X86-SSE-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    pushl %esi # encoding: [0x56]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-AVX1-NEXT:    vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06]
; X86-AVX1-NEXT:    vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11]
; X86-AVX1-NEXT:    vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18]
; X86-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
; X86-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
; X86-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
; X86-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
; X86-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X86-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
; X86-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X86-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
; X86-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X86-AVX1-NEXT:    vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16]
; X86-AVX1-NEXT:    vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a]
; X86-AVX1-NEXT:    vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21]
; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX1-NEXT:    popl %esi # encoding: [0x5e]
; X86-AVX1-NEXT:    retl # encoding: [0xc3]
;
; X86-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    pushl %esi # encoding: [0x56]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
; X86-AVX512-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
; X86-AVX512-NEXT:    vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a]
; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11]
; X86-AVX512-NEXT:    vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18]
; X86-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
; X86-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
; X86-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
; X86-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
; X86-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X86-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
; X86-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X86-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
; X86-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X86-AVX512-NEXT:    vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16]
; X86-AVX512-NEXT:    vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a]
; X86-AVX512-NEXT:    vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21]
; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
; X86-AVX512-NEXT:    popl %esi # encoding: [0x5e]
; X86-AVX512-NEXT:    retl # encoding: [0xc3]
;
; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
; X64-SSE-NEXT:    movaps (%rsi), %xmm1 # encoding: [0x0f,0x28,0x0e]
; X64-SSE-NEXT:    movaps (%rdx), %xmm2 # encoding: [0x0f,0x28,0x12]
; X64-SSE-NEXT:    movaps (%rcx), %xmm3 # encoding: [0x0f,0x28,0x19]
; X64-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
; X64-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
; X64-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X64-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
; X64-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
; X64-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X64-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
; X64-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
; X64-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
; X64-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
; X64-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
; X64-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
; X64-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
; X64-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
; X64-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
; X64-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
; X64-SSE-NEXT:    movaps %xmm1, (%rdi) # encoding: [0x0f,0x29,0x0f]
; X64-SSE-NEXT:    movaps %xmm5, (%rsi) # encoding: [0x0f,0x29,0x2e]
; X64-SSE-NEXT:    movaps %xmm3, (%rdx) # encoding: [0x0f,0x29,0x1a]
; X64-SSE-NEXT:    movaps %xmm2, (%rcx) # encoding: [0x0f,0x29,0x11]
; X64-SSE-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX1-NEXT:    vmovaps (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0e]
; X64-AVX1-NEXT:    vmovaps (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x12]
; X64-AVX1-NEXT:    vmovaps (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x19]
; X64-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
; X64-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
; X64-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
; X64-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
; X64-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X64-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
; X64-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X64-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X64-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
; X64-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X64-AVX1-NEXT:    vmovaps %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x29,0x17]
; X64-AVX1-NEXT:    vmovaps %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x29,0x1e]
; X64-AVX1-NEXT:    vmovaps %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x29,0x22]
; X64-AVX1-NEXT:    vmovaps %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x29,0x01]
; X64-AVX1-NEXT:    retq # encoding: [0xc3]
;
; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; X64-AVX512-NEXT:    vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e]
; X64-AVX512-NEXT:    vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12]
; X64-AVX512-NEXT:    vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19]
; X64-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
; X64-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
; X64-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
; X64-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
; X64-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
; X64-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
; X64-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
; X64-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
; X64-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
; X64-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
; X64-AVX512-NEXT:    vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17]
; X64-AVX512-NEXT:    vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e]
; X64-AVX512-NEXT:    vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22]
; X64-AVX512-NEXT:    vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01]
; X64-AVX512-NEXT:    retq # encoding: [0xc3]
  %row0 = load <4 x float>, <4 x float>* %a0, align 16
  %row1 = load <4 x float>, <4 x float>* %a1, align 16
  %row2 = load <4 x float>, <4 x float>* %a2, align 16
  %row3 = load <4 x float>, <4 x float>* %a3, align 16
  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  store <4 x float> %res0, <4 x float>* %a0, align 16
  store <4 x float> %res1, <4 x float>* %a1, align 16
  store <4 x float> %res2, <4 x float>* %a2, align 16
  store <4 x float> %res3, <4 x float>* %a3, align 16
  ret void
}

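; ucomiss sets ZF for equal but also for unordered operands, so EQ must
; combine sete with setnp to reject NaNs.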
define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomieq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomieq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomieq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone

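; GE reads CF directly with setae; unordered sets CF, so no parity fix-up is
; needed.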
define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomige_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomige_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomige_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone

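; GT uses seta (CF=0 and ZF=0), which is already false for unordered operands.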
define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomigt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomigt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomigt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone

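; LE is implemented as GE with the (v)ucomiss operands swapped.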
define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomile_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomile_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomile_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone

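; LT likewise swaps the operands and uses seta.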
define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomilt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomilt_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomilt_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone

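; NEQ is true for unordered operands, so setne is OR'd with setp.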
define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_ucomineq_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_ucomineq_ss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_ucomineq_ss:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone

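; Returning undef requires no computation, so only the return is emitted.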
define <4 x float> @test_mm_undefined_ps() {
; CHECK-LABEL: test_mm_undefined_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  ret <4 x float> undef
}

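; unpckhps interleaves the upper halves of the two sources; the shuffle mask
; <2, 6, 3, 7> matches the xmm0[2],xmm1[2],xmm0[3],xmm1[3] pattern in the
; checks below.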
define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_unpackhi_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
; SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_unpackhi_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
; AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_unpackhi_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
; AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %res
}

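; unpcklps is the lower-half counterpart: mask <0, 4, 1, 5> interleaves
; elements 0 and 1 of each source.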
define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_unpacklo_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_unpacklo_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1]
; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_unpacklo_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1]
; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %res
}

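; LLVM IR has no floating-point xor, so the operands are bitcast to <4 x i32>
; around an integer xor; the backend still selects a single xorps/vxorps.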
define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_mm_xor_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm0 # encoding: [0x0f,0x57,0xc1]
; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX1-LABEL: test_mm_xor_ps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1]
; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512-LABEL: test_mm_xor_ps:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = xor <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

!0 = !{i32 1}