1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
3; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
4; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
5; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
6; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
7; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
8
9; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
10
11define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
12; SSE-LABEL: test_mm_add_ps:
13; SSE:       # %bb.0:
14; SSE-NEXT:    addps %xmm1, %xmm0 # encoding: [0x0f,0x58,0xc1]
15; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
16;
17; AVX1-LABEL: test_mm_add_ps:
18; AVX1:       # %bb.0:
19; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
20; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
21;
22; AVX512-LABEL: test_mm_add_ps:
23; AVX512:       # %bb.0:
24; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
25; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
26  %res = fadd <4 x float> %a0, %a1
27  ret <4 x float> %res
28}
29
30define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
31; SSE-LABEL: test_mm_add_ss:
32; SSE:       # %bb.0:
33; SSE-NEXT:    addss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x58,0xc1]
34; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
35;
36; AVX1-LABEL: test_mm_add_ss:
37; AVX1:       # %bb.0:
38; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x58,0xc1]
39; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
40;
41; AVX512-LABEL: test_mm_add_ss:
42; AVX512:       # %bb.0:
43; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
44; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
45  %ext0 = extractelement <4 x float> %a0, i32 0
46  %ext1 = extractelement <4 x float> %a1, i32 0
47  %fadd = fadd float %ext0, %ext1
48  %res = insertelement <4 x float> %a0, float %fadd, i32 0
49  ret <4 x float> %res
50}
51
52define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
53; SSE-LABEL: test_mm_and_ps:
54; SSE:       # %bb.0:
55; SSE-NEXT:    andps %xmm1, %xmm0 # encoding: [0x0f,0x54,0xc1]
56; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
57;
58; AVX1-LABEL: test_mm_and_ps:
59; AVX1:       # %bb.0:
60; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1]
61; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
62;
63; AVX512-LABEL: test_mm_and_ps:
64; AVX512:       # %bb.0:
65; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
66; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
67  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
68  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
69  %res = and <4 x i32> %arg0, %arg1
70  %bc = bitcast <4 x i32> %res to <4 x float>
71  ret <4 x float> %bc
72}
73
74define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
75; SSE-LABEL: test_mm_andnot_ps:
76; SSE:       # %bb.0:
77; SSE-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
78; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
79;
80; AVX1-LABEL: test_mm_andnot_ps:
81; AVX1:       # %bb.0:
82; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0x76,0xd2]
83; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xef,0xc2]
84; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xdb,0xc1]
85; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
86;
87; AVX512-LABEL: test_mm_andnot_ps:
88; AVX512:       # %bb.0:
89; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf3,0xfd,0x08,0x25,0xc0,0x0f]
90; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
91; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
92  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
93  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
94  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
95  %res = and <4 x i32> %not, %arg1
96  %bc = bitcast <4 x i32> %res to <4 x float>
97  ret <4 x float> %bc
98}
99
100define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
101; SSE-LABEL: test_mm_cmpeq_ps:
102; SSE:       # %bb.0:
103; SSE-NEXT:    cmpeqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x00]
104; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
105;
106; AVX1-LABEL: test_mm_cmpeq_ps:
107; AVX1:       # %bb.0:
108; AVX1-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
109; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
110;
111; AVX512-LABEL: test_mm_cmpeq_ps:
112; AVX512:       # %bb.0:
113; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x00]
114; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
115; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
116  %cmp = fcmp oeq <4 x float> %a0, %a1
117  %sext = sext <4 x i1> %cmp to <4 x i32>
118  %res = bitcast <4 x i32> %sext to <4 x float>
119  ret <4 x float> %res
120}
121
122define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
123; SSE-LABEL: test_mm_cmpeq_ss:
124; SSE:       # %bb.0:
125; SSE-NEXT:    cmpeqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x00]
126; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
127;
128; AVX-LABEL: test_mm_cmpeq_ss:
129; AVX:       # %bb.0:
130; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x00]
131; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
132  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
133  ret <4 x float> %res
134}
135declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
136
137define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
138; SSE-LABEL: test_mm_cmpge_ps:
139; SSE:       # %bb.0:
140; SSE-NEXT:    cmpleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x02]
141; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
142; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
143;
144; AVX1-LABEL: test_mm_cmpge_ps:
145; AVX1:       # %bb.0:
146; AVX1-NEXT:    vcmpleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x02]
147; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
148;
149; AVX512-LABEL: test_mm_cmpge_ps:
150; AVX512:       # %bb.0:
151; AVX512-NEXT:    vcmpleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x02]
152; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
153; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
154  %cmp = fcmp ole <4 x float> %a1, %a0
155  %sext = sext <4 x i1> %cmp to <4 x i32>
156  %res = bitcast <4 x i32> %sext to <4 x float>
157  ret <4 x float> %res
158}
159
160define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
161; SSE-LABEL: test_mm_cmpge_ss:
162; SSE:       # %bb.0:
163; SSE-NEXT:    cmpless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x02]
164; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
165; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
166; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
167;
168; AVX-LABEL: test_mm_cmpge_ss:
169; AVX:       # %bb.0:
170; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
171; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
172; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
173; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
174  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
175  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
176  ret <4 x float> %res
177}
178
179define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
180; SSE-LABEL: test_mm_cmpgt_ps:
181; SSE:       # %bb.0:
182; SSE-NEXT:    cmpltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x01]
183; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
184; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
185;
186; AVX1-LABEL: test_mm_cmpgt_ps:
187; AVX1:       # %bb.0:
188; AVX1-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x01]
189; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
190;
191; AVX512-LABEL: test_mm_cmpgt_ps:
192; AVX512:       # %bb.0:
193; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x01]
194; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
195; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
196  %cmp = fcmp olt <4 x float> %a1, %a0
197  %sext = sext <4 x i1> %cmp to <4 x i32>
198  %res = bitcast <4 x i32> %sext to <4 x float>
199  ret <4 x float> %res
200}
201
202define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
203; SSE-LABEL: test_mm_cmpgt_ss:
204; SSE:       # %bb.0:
205; SSE-NEXT:    cmpltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x01]
206; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
207; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
208; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
209;
210; AVX-LABEL: test_mm_cmpgt_ss:
211; AVX:       # %bb.0:
212; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
213; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
214; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
215; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
216  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
217  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
218  ret <4 x float> %res
219}
220
221define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
222; SSE-LABEL: test_mm_cmple_ps:
223; SSE:       # %bb.0:
224; SSE-NEXT:    cmpleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x02]
225; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
226;
227; AVX1-LABEL: test_mm_cmple_ps:
228; AVX1:       # %bb.0:
229; AVX1-NEXT:    vcmpleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x02]
230; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
231;
232; AVX512-LABEL: test_mm_cmple_ps:
233; AVX512:       # %bb.0:
234; AVX512-NEXT:    vcmpleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
235; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
236; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
237  %cmp = fcmp ole <4 x float> %a0, %a1
238  %sext = sext <4 x i1> %cmp to <4 x i32>
239  %res = bitcast <4 x i32> %sext to <4 x float>
240  ret <4 x float> %res
241}
242
243define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
244; SSE-LABEL: test_mm_cmple_ss:
245; SSE:       # %bb.0:
246; SSE-NEXT:    cmpless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x02]
247; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
248;
249; AVX-LABEL: test_mm_cmple_ss:
250; AVX:       # %bb.0:
251; AVX-NEXT:    vcmpless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x02]
252; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
253  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
254  ret <4 x float> %res
255}
256
257define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
258; SSE-LABEL: test_mm_cmplt_ps:
259; SSE:       # %bb.0:
260; SSE-NEXT:    cmpltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x01]
261; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
262;
263; AVX1-LABEL: test_mm_cmplt_ps:
264; AVX1:       # %bb.0:
265; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x01]
266; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
267;
268; AVX512-LABEL: test_mm_cmplt_ps:
269; AVX512:       # %bb.0:
270; AVX512-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
271; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
272; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
273  %cmp = fcmp olt <4 x float> %a0, %a1
274  %sext = sext <4 x i1> %cmp to <4 x i32>
275  %res = bitcast <4 x i32> %sext to <4 x float>
276  ret <4 x float> %res
277}
278
279define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
280; SSE-LABEL: test_mm_cmplt_ss:
281; SSE:       # %bb.0:
282; SSE-NEXT:    cmpltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x01]
283; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
284;
285; AVX-LABEL: test_mm_cmplt_ss:
286; AVX:       # %bb.0:
287; AVX-NEXT:    vcmpltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x01]
288; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
289  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
290  ret <4 x float> %res
291}
292
293define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
294; SSE-LABEL: test_mm_cmpneq_ps:
295; SSE:       # %bb.0:
296; SSE-NEXT:    cmpneqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x04]
297; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
298;
299; AVX1-LABEL: test_mm_cmpneq_ps:
300; AVX1:       # %bb.0:
301; AVX1-NEXT:    vcmpneqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x04]
302; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
303;
304; AVX512-LABEL: test_mm_cmpneq_ps:
305; AVX512:       # %bb.0:
306; AVX512-NEXT:    vcmpneqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x04]
307; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
308; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
309  %cmp = fcmp une <4 x float> %a0, %a1
310  %sext = sext <4 x i1> %cmp to <4 x i32>
311  %res = bitcast <4 x i32> %sext to <4 x float>
312  ret <4 x float> %res
313}
314
315define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
316; SSE-LABEL: test_mm_cmpneq_ss:
317; SSE:       # %bb.0:
318; SSE-NEXT:    cmpneqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x04]
319; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
320;
321; AVX-LABEL: test_mm_cmpneq_ss:
322; AVX:       # %bb.0:
323; AVX-NEXT:    vcmpneqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x04]
324; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
325  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
326  ret <4 x float> %res
327}
328
329define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
330; SSE-LABEL: test_mm_cmpnge_ps:
331; SSE:       # %bb.0:
332; SSE-NEXT:    cmpnleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x06]
333; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
334; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
335;
336; AVX1-LABEL: test_mm_cmpnge_ps:
337; AVX1:       # %bb.0:
338; AVX1-NEXT:    vcmpnleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x06]
339; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
340;
341; AVX512-LABEL: test_mm_cmpnge_ps:
342; AVX512:       # %bb.0:
343; AVX512-NEXT:    vcmpnleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x06]
344; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
345; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
346  %cmp = fcmp ugt <4 x float> %a1, %a0
347  %sext = sext <4 x i1> %cmp to <4 x i32>
348  %res = bitcast <4 x i32> %sext to <4 x float>
349  ret <4 x float> %res
350}
351
352define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
353; SSE-LABEL: test_mm_cmpnge_ss:
354; SSE:       # %bb.0:
355; SSE-NEXT:    cmpnless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x06]
356; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
357; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
358; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
359;
360; AVX-LABEL: test_mm_cmpnge_ss:
361; AVX:       # %bb.0:
362; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
363; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
364; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
365; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
366  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
367  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
368  ret <4 x float> %res
369}
370
371define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
372; SSE-LABEL: test_mm_cmpngt_ps:
373; SSE:       # %bb.0:
374; SSE-NEXT:    cmpnltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x05]
375; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
376; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
377;
378; AVX1-LABEL: test_mm_cmpngt_ps:
379; AVX1:       # %bb.0:
380; AVX1-NEXT:    vcmpnltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x05]
381; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
382;
383; AVX512-LABEL: test_mm_cmpngt_ps:
384; AVX512:       # %bb.0:
385; AVX512-NEXT:    vcmpnltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x05]
386; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
387; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
388  %cmp = fcmp uge <4 x float> %a1, %a0
389  %sext = sext <4 x i1> %cmp to <4 x i32>
390  %res = bitcast <4 x i32> %sext to <4 x float>
391  ret <4 x float> %res
392}
393
394define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
395; SSE-LABEL: test_mm_cmpngt_ss:
396; SSE:       # %bb.0:
397; SSE-NEXT:    cmpnltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x05]
398; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
399; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
400; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
401;
402; AVX-LABEL: test_mm_cmpngt_ss:
403; AVX:       # %bb.0:
404; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
405; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
406; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
407; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
408  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
409  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
410  ret <4 x float> %res
411}
412
413define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
414; SSE-LABEL: test_mm_cmpnle_ps:
415; SSE:       # %bb.0:
416; SSE-NEXT:    cmpnleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x06]
417; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
418;
419; AVX1-LABEL: test_mm_cmpnle_ps:
420; AVX1:       # %bb.0:
421; AVX1-NEXT:    vcmpnleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x06]
422; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
423;
424; AVX512-LABEL: test_mm_cmpnle_ps:
425; AVX512:       # %bb.0:
426; AVX512-NEXT:    vcmpnleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x06]
427; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
428; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
429  %cmp = fcmp ugt <4 x float> %a0, %a1
430  %sext = sext <4 x i1> %cmp to <4 x i32>
431  %res = bitcast <4 x i32> %sext to <4 x float>
432  ret <4 x float> %res
433}
434
435define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
436; SSE-LABEL: test_mm_cmpnle_ss:
437; SSE:       # %bb.0:
438; SSE-NEXT:    cmpnless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x06]
439; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
440;
441; AVX-LABEL: test_mm_cmpnle_ss:
442; AVX:       # %bb.0:
443; AVX-NEXT:    vcmpnless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x06]
444; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
445  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
446  ret <4 x float> %res
447}
448
449define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
450; SSE-LABEL: test_mm_cmpnlt_ps:
451; SSE:       # %bb.0:
452; SSE-NEXT:    cmpnltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x05]
453; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
454;
455; AVX1-LABEL: test_mm_cmpnlt_ps:
456; AVX1:       # %bb.0:
457; AVX1-NEXT:    vcmpnltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x05]
458; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
459;
460; AVX512-LABEL: test_mm_cmpnlt_ps:
461; AVX512:       # %bb.0:
462; AVX512-NEXT:    vcmpnltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x05]
463; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
464; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
465  %cmp = fcmp uge <4 x float> %a0, %a1
466  %sext = sext <4 x i1> %cmp to <4 x i32>
467  %res = bitcast <4 x i32> %sext to <4 x float>
468  ret <4 x float> %res
469}
470
471define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
472; SSE-LABEL: test_mm_cmpnlt_ss:
473; SSE:       # %bb.0:
474; SSE-NEXT:    cmpnltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x05]
475; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
476;
477; AVX-LABEL: test_mm_cmpnlt_ss:
478; AVX:       # %bb.0:
479; AVX-NEXT:    vcmpnltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x05]
480; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
481  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
482  ret <4 x float> %res
483}
484
485define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
486; SSE-LABEL: test_mm_cmpord_ps:
487; SSE:       # %bb.0:
488; SSE-NEXT:    cmpordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x07]
489; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
490;
491; AVX1-LABEL: test_mm_cmpord_ps:
492; AVX1:       # %bb.0:
493; AVX1-NEXT:    vcmpordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
494; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
495;
496; AVX512-LABEL: test_mm_cmpord_ps:
497; AVX512:       # %bb.0:
498; AVX512-NEXT:    vcmpordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x07]
499; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
500; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
501  %cmp = fcmp ord <4 x float> %a0, %a1
502  %sext = sext <4 x i1> %cmp to <4 x i32>
503  %res = bitcast <4 x i32> %sext to <4 x float>
504  ret <4 x float> %res
505}
506
507define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
508; SSE-LABEL: test_mm_cmpord_ss:
509; SSE:       # %bb.0:
510; SSE-NEXT:    cmpordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x07]
511; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
512;
513; AVX-LABEL: test_mm_cmpord_ss:
514; AVX:       # %bb.0:
515; AVX-NEXT:    vcmpordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
516; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
517  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
518  ret <4 x float> %res
519}
520
521define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
522; SSE-LABEL: test_mm_cmpunord_ps:
523; SSE:       # %bb.0:
524; SSE-NEXT:    cmpunordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x03]
525; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
526;
527; AVX1-LABEL: test_mm_cmpunord_ps:
528; AVX1:       # %bb.0:
529; AVX1-NEXT:    vcmpunordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x03]
530; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
531;
532; AVX512-LABEL: test_mm_cmpunord_ps:
533; AVX512:       # %bb.0:
534; AVX512-NEXT:    vcmpunordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x03]
535; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
536; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
537  %cmp = fcmp uno <4 x float> %a0, %a1
538  %sext = sext <4 x i1> %cmp to <4 x i32>
539  %res = bitcast <4 x i32> %sext to <4 x float>
540  ret <4 x float> %res
541}
542
543define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
544; SSE-LABEL: test_mm_cmpunord_ss:
545; SSE:       # %bb.0:
546; SSE-NEXT:    cmpunordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x03]
547; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
548;
549; AVX-LABEL: test_mm_cmpunord_ss:
550; AVX:       # %bb.0:
551; AVX-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x03]
552; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
553  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
554  ret <4 x float> %res
555}
556
557define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
558; SSE-LABEL: test_mm_comieq_ss:
559; SSE:       # %bb.0:
560; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
561; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
562; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
563; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
564; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
565; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
566;
567; AVX1-LABEL: test_mm_comieq_ss:
568; AVX1:       # %bb.0:
569; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
570; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
571; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
572; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
573; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
574; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
575;
576; AVX512-LABEL: test_mm_comieq_ss:
577; AVX512:       # %bb.0:
578; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
579; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
580; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
581; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
582; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
583; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
584  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
585  ret i32 %res
586}
587declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
588
589define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
590; SSE-LABEL: test_mm_comige_ss:
591; SSE:       # %bb.0:
592; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
593; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
594; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
595; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
596;
597; AVX1-LABEL: test_mm_comige_ss:
598; AVX1:       # %bb.0:
599; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
600; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
601; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
602; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
603;
604; AVX512-LABEL: test_mm_comige_ss:
605; AVX512:       # %bb.0:
606; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
607; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
608; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
609; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
610  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
611  ret i32 %res
612}
613declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
614
615define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
616; SSE-LABEL: test_mm_comigt_ss:
617; SSE:       # %bb.0:
618; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
619; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
620; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
621; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
622;
623; AVX1-LABEL: test_mm_comigt_ss:
624; AVX1:       # %bb.0:
625; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
626; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
627; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
628; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
629;
630; AVX512-LABEL: test_mm_comigt_ss:
631; AVX512:       # %bb.0:
632; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
633; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
634; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
635; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
636  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
637  ret i32 %res
638}
639declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
640
641define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
642; SSE-LABEL: test_mm_comile_ss:
643; SSE:       # %bb.0:
644; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
645; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
646; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
647; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
648;
649; AVX1-LABEL: test_mm_comile_ss:
650; AVX1:       # %bb.0:
651; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
652; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
653; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
654; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
655;
656; AVX512-LABEL: test_mm_comile_ss:
657; AVX512:       # %bb.0:
658; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
659; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
660; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
661; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
662  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
663  ret i32 %res
664}
665declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
666
667define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
668; SSE-LABEL: test_mm_comilt_ss:
669; SSE:       # %bb.0:
670; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
671; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
672; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
673; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
674;
675; AVX1-LABEL: test_mm_comilt_ss:
676; AVX1:       # %bb.0:
677; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
678; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
679; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
680; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
681;
682; AVX512-LABEL: test_mm_comilt_ss:
683; AVX512:       # %bb.0:
684; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
685; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
686; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
687; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
688  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
689  ret i32 %res
690}
691declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
692
693define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
694; SSE-LABEL: test_mm_comineq_ss:
695; SSE:       # %bb.0:
696; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
697; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
698; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
699; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
700; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
701; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
702;
703; AVX1-LABEL: test_mm_comineq_ss:
704; AVX1:       # %bb.0:
705; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
706; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
707; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
708; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
709; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
710; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
711;
712; AVX512-LABEL: test_mm_comineq_ss:
713; AVX512:       # %bb.0:
714; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
715; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
716; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
717; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
718; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
719; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
720  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
721  ret i32 %res
722}
723declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
724
725define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
726; SSE-LABEL: test_mm_cvt_ss2si:
727; SSE:       # %bb.0:
728; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
729; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
730;
731; AVX1-LABEL: test_mm_cvt_ss2si:
732; AVX1:       # %bb.0:
733; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
734; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
735;
736; AVX512-LABEL: test_mm_cvt_ss2si:
737; AVX512:       # %bb.0:
738; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
739; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
740  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
741  ret i32 %res
742}
743declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
744
745define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
746; X86-SSE-LABEL: test_mm_cvtsi32_ss:
747; X86-SSE:       # %bb.0:
748; X86-SSE-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04]
749; X86-SSE-NEXT:    retl # encoding: [0xc3]
750;
751; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
752; X86-AVX1:       # %bb.0:
753; X86-AVX1-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
754; X86-AVX1-NEXT:    retl # encoding: [0xc3]
755;
756; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
757; X86-AVX512:       # %bb.0:
758; X86-AVX512-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
759; X86-AVX512-NEXT:    retl # encoding: [0xc3]
760;
761; X64-SSE-LABEL: test_mm_cvtsi32_ss:
762; X64-SSE:       # %bb.0:
763; X64-SSE-NEXT:    cvtsi2ss %edi, %xmm0 # encoding: [0xf3,0x0f,0x2a,0xc7]
764; X64-SSE-NEXT:    retq # encoding: [0xc3]
765;
766; X64-AVX1-LABEL: test_mm_cvtsi32_ss:
767; X64-AVX1:       # %bb.0:
768; X64-AVX1-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0xc7]
769; X64-AVX1-NEXT:    retq # encoding: [0xc3]
770;
771; X64-AVX512-LABEL: test_mm_cvtsi32_ss:
772; X64-AVX512:       # %bb.0:
773; X64-AVX512-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7]
774; X64-AVX512-NEXT:    retq # encoding: [0xc3]
775  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
776  ret <4 x float> %res
777}
778declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
779
780define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
781; X86-SSE-LABEL: test_mm_cvtss_f32:
782; X86-SSE:       # %bb.0:
783; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
784; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
785; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
786; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
787; X86-SSE-NEXT:    retl # encoding: [0xc3]
788;
789; X86-AVX1-LABEL: test_mm_cvtss_f32:
790; X86-AVX1:       # %bb.0:
791; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
792; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
793; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
794; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
795; X86-AVX1-NEXT:    retl # encoding: [0xc3]
796;
797; X86-AVX512-LABEL: test_mm_cvtss_f32:
798; X86-AVX512:       # %bb.0:
799; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
800; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
801; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
802; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
803; X86-AVX512-NEXT:    retl # encoding: [0xc3]
804;
805; X64-LABEL: test_mm_cvtss_f32:
806; X64:       # %bb.0:
807; X64-NEXT:    retq # encoding: [0xc3]
808  %res = extractelement <4 x float> %a0, i32 0
809  ret float %res
810}
811
812define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
813; SSE-LABEL: test_mm_cvtss_si32:
814; SSE:       # %bb.0:
815; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
816; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
817;
818; AVX1-LABEL: test_mm_cvtss_si32:
819; AVX1:       # %bb.0:
820; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
821; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
822;
823; AVX512-LABEL: test_mm_cvtss_si32:
824; AVX512:       # %bb.0:
825; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
826; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
827  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
828  ret i32 %res
829}
830
831define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
832; SSE-LABEL: test_mm_cvttss_si:
833; SSE:       # %bb.0:
834; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
835; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
836;
837; AVX1-LABEL: test_mm_cvttss_si:
838; AVX1:       # %bb.0:
839; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
840; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
841;
842; AVX512-LABEL: test_mm_cvttss_si:
843; AVX512:       # %bb.0:
844; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
845; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
846  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
847  ret i32 %res
848}
849declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
850
851define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
852; SSE-LABEL: test_mm_cvttss_si32:
853; SSE:       # %bb.0:
854; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
855; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
856;
857; AVX1-LABEL: test_mm_cvttss_si32:
858; AVX1:       # %bb.0:
859; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
860; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
861;
862; AVX512-LABEL: test_mm_cvttss_si32:
863; AVX512:       # %bb.0:
864; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
865; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
866  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
867  ret i32 %res
868}
869
870define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
871; SSE-LABEL: test_mm_div_ps:
872; SSE:       # %bb.0:
873; SSE-NEXT:    divps %xmm1, %xmm0 # encoding: [0x0f,0x5e,0xc1]
874; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
875;
876; AVX1-LABEL: test_mm_div_ps:
877; AVX1:       # %bb.0:
878; AVX1-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5e,0xc1]
879; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
880;
881; AVX512-LABEL: test_mm_div_ps:
882; AVX512:       # %bb.0:
883; AVX512-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1]
884; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
885  %res = fdiv <4 x float> %a0, %a1
886  ret <4 x float> %res
887}
888
889define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
890; SSE-LABEL: test_mm_div_ss:
891; SSE:       # %bb.0:
892; SSE-NEXT:    divss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5e,0xc1]
893; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
894;
895; AVX1-LABEL: test_mm_div_ss:
896; AVX1:       # %bb.0:
897; AVX1-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5e,0xc1]
898; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
899;
900; AVX512-LABEL: test_mm_div_ss:
901; AVX512:       # %bb.0:
902; AVX512-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1]
903; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
904  %ext0 = extractelement <4 x float> %a0, i32 0
905  %ext1 = extractelement <4 x float> %a1, i32 0
906  %fdiv = fdiv float %ext0, %ext1
907  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
908  ret <4 x float> %res
909}
910
911define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
912; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
913; X86-SSE:       # %bb.0:
914; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
915; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
916; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
917; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
918; X86-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
919; X86-SSE-NEXT:    # imm = 0x1F80
920; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
921; X86-SSE-NEXT:    retl # encoding: [0xc3]
922;
923; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
924; X86-AVX:       # %bb.0:
925; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
926; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
927; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
928; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
929; X86-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
930; X86-AVX-NEXT:    # imm = 0x1F80
931; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
932; X86-AVX-NEXT:    retl # encoding: [0xc3]
933;
934; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
935; X64-SSE:       # %bb.0:
936; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
937; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
938; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
939; X64-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
940; X64-SSE-NEXT:    # imm = 0x1F80
941; X64-SSE-NEXT:    retq # encoding: [0xc3]
942;
943; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
944; X64-AVX:       # %bb.0:
945; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
946; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
947; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
948; X64-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
949; X64-AVX-NEXT:    # imm = 0x1F80
950; X64-AVX-NEXT:    retq # encoding: [0xc3]
951  %1 = alloca i32, align 4
952  %2 = bitcast i32* %1 to i8*
953  call void @llvm.x86.sse.stmxcsr(i8* %2)
954  %3 = load i32, i32* %1, align 4
955  %4 = and i32 %3, 8064
956  ret i32 %4
957}
958declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
959
960define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
961; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
962; X86-SSE:       # %bb.0:
963; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
964; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
965; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
966; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
967; X86-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
968; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
969; X86-SSE-NEXT:    retl # encoding: [0xc3]
970;
971; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
972; X86-AVX:       # %bb.0:
973; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
974; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
975; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
976; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
977; X86-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
978; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
979; X86-AVX-NEXT:    retl # encoding: [0xc3]
980;
981; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
982; X64-SSE:       # %bb.0:
983; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
984; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
985; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
986; X64-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
987; X64-SSE-NEXT:    retq # encoding: [0xc3]
988;
989; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
990; X64-AVX:       # %bb.0:
991; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
992; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
993; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
994; X64-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
995; X64-AVX-NEXT:    retq # encoding: [0xc3]
996  %1 = alloca i32, align 4
997  %2 = bitcast i32* %1 to i8*
998  call void @llvm.x86.sse.stmxcsr(i8* %2)
999  %3 = load i32, i32* %1, align 4
1000  %4 = and i32 %3, 63
1001  ret i32 %4
1002}
1003
1004define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
1005; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1006; X86-SSE:       # %bb.0:
1007; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1008; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1009; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
1010; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1011; X86-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1012; X86-SSE-NEXT:    # imm = 0x8000
1013; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
1014; X86-SSE-NEXT:    retl # encoding: [0xc3]
1015;
1016; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1017; X86-AVX:       # %bb.0:
1018; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1019; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1020; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
1021; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1022; X86-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1023; X86-AVX-NEXT:    # imm = 0x8000
1024; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
1025; X86-AVX-NEXT:    retl # encoding: [0xc3]
1026;
1027; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1028; X64-SSE:       # %bb.0:
1029; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1030; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1031; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1032; X64-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1033; X64-SSE-NEXT:    # imm = 0x8000
1034; X64-SSE-NEXT:    retq # encoding: [0xc3]
1035;
1036; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
1037; X64-AVX:       # %bb.0:
1038; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1039; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1040; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1041; X64-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
1042; X64-AVX-NEXT:    # imm = 0x8000
1043; X64-AVX-NEXT:    retq # encoding: [0xc3]
1044  %1 = alloca i32, align 4
1045  %2 = bitcast i32* %1 to i8*
1046  call void @llvm.x86.sse.stmxcsr(i8* %2)
1047  %3 = load i32, i32* %1, align 4
1048  %4 = and i32 %3, 32768
1049  ret i32 %4
1050}
1051
1052define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
1053; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
1054; X86-SSE:       # %bb.0:
1055; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1056; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1057; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
1058; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1059; X86-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1060; X86-SSE-NEXT:    # imm = 0x6000
1061; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
1062; X86-SSE-NEXT:    retl # encoding: [0xc3]
1063;
1064; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
1065; X86-AVX:       # %bb.0:
1066; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1067; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1068; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
1069; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1070; X86-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1071; X86-AVX-NEXT:    # imm = 0x6000
1072; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
1073; X86-AVX-NEXT:    retl # encoding: [0xc3]
1074;
1075; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
1076; X64-SSE:       # %bb.0:
1077; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1078; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1079; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1080; X64-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1081; X64-SSE-NEXT:    # imm = 0x6000
1082; X64-SSE-NEXT:    retq # encoding: [0xc3]
1083;
1084; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
1085; X64-AVX:       # %bb.0:
1086; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1087; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1088; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1089; X64-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
1090; X64-AVX-NEXT:    # imm = 0x6000
1091; X64-AVX-NEXT:    retq # encoding: [0xc3]
1092  %1 = alloca i32, align 4
1093  %2 = bitcast i32* %1 to i8*
1094  call void @llvm.x86.sse.stmxcsr(i8* %2)
1095  %3 = load i32, i32* %1, align 4
1096  %4 = and i32 %3, 24576
1097  ret i32 %4
1098}
1099
1100define i32 @test_mm_getcsr() nounwind {
1101; X86-SSE-LABEL: test_mm_getcsr:
1102; X86-SSE:       # %bb.0:
1103; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1104; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1105; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
1106; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1107; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
1108; X86-SSE-NEXT:    retl # encoding: [0xc3]
1109;
1110; X86-AVX-LABEL: test_mm_getcsr:
1111; X86-AVX:       # %bb.0:
1112; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1113; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
1114; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
1115; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
1116; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
1117; X86-AVX-NEXT:    retl # encoding: [0xc3]
1118;
1119; X64-SSE-LABEL: test_mm_getcsr:
1120; X64-SSE:       # %bb.0:
1121; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1122; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1123; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1124; X64-SSE-NEXT:    retq # encoding: [0xc3]
1125;
1126; X64-AVX-LABEL: test_mm_getcsr:
1127; X64-AVX:       # %bb.0:
1128; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1129; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1130; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
1131; X64-AVX-NEXT:    retq # encoding: [0xc3]
1132  %1 = alloca i32, align 4
1133  %2 = bitcast i32* %1 to i8*
1134  call void @llvm.x86.sse.stmxcsr(i8* %2)
1135  %3 = load i32, i32* %1, align 4
1136  ret i32 %3
1137}
1138
1139define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
1140; X86-SSE-LABEL: test_mm_load_ps:
1141; X86-SSE:       # %bb.0:
1142; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1143; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
1144; X86-SSE-NEXT:    retl # encoding: [0xc3]
1145;
1146; X86-AVX1-LABEL: test_mm_load_ps:
1147; X86-AVX1:       # %bb.0:
1148; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1149; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00]
1150; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1151;
1152; X86-AVX512-LABEL: test_mm_load_ps:
1153; X86-AVX512:       # %bb.0:
1154; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1155; X86-AVX512-NEXT:    vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
1156; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1157;
1158; X64-SSE-LABEL: test_mm_load_ps:
1159; X64-SSE:       # %bb.0:
1160; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
1161; X64-SSE-NEXT:    retq # encoding: [0xc3]
1162;
1163; X64-AVX1-LABEL: test_mm_load_ps:
1164; X64-AVX1:       # %bb.0:
1165; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
1166; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1167;
1168; X64-AVX512-LABEL: test_mm_load_ps:
1169; X64-AVX512:       # %bb.0:
1170; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
1171; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1172  %arg0 = bitcast float* %a0 to <4 x float>*
1173  %res = load <4 x float>, <4 x float>* %arg0, align 16
1174  ret <4 x float> %res
1175}
1176
1177define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
1178; X86-SSE-LABEL: test_mm_load_ps1:
1179; X86-SSE:       # %bb.0:
1180; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1181; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1182; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1183; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1184; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1185; X86-SSE-NEXT:    retl # encoding: [0xc3]
1186;
1187; X86-AVX1-LABEL: test_mm_load_ps1:
1188; X86-AVX1:       # %bb.0:
1189; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1190; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
1191; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1192;
1193; X86-AVX512-LABEL: test_mm_load_ps1:
1194; X86-AVX512:       # %bb.0:
1195; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1196; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
1197; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1198;
1199; X64-SSE-LABEL: test_mm_load_ps1:
1200; X64-SSE:       # %bb.0:
1201; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1202; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1203; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1204; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1205; X64-SSE-NEXT:    retq # encoding: [0xc3]
1206;
1207; X64-AVX1-LABEL: test_mm_load_ps1:
1208; X64-AVX1:       # %bb.0:
1209; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
1210; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1211;
1212; X64-AVX512-LABEL: test_mm_load_ps1:
1213; X64-AVX512:       # %bb.0:
1214; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
1215; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1216  %ld = load float, float* %a0, align 4
1217  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1218  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
1219  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
1220  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
1221  ret <4 x float> %res3
1222}
1223
1224define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
1225; X86-SSE-LABEL: test_mm_load_ss:
1226; X86-SSE:       # %bb.0:
1227; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1228; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1229; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1230; X86-SSE-NEXT:    retl # encoding: [0xc3]
1231;
1232; X86-AVX1-LABEL: test_mm_load_ss:
1233; X86-AVX1:       # %bb.0:
1234; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1235; X86-AVX1-NEXT:    vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00]
1236; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
1237; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1238;
1239; X86-AVX512-LABEL: test_mm_load_ss:
1240; X86-AVX512:       # %bb.0:
1241; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1242; X86-AVX512-NEXT:    vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
1243; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
1244; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1245;
1246; X64-SSE-LABEL: test_mm_load_ss:
1247; X64-SSE:       # %bb.0:
1248; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1249; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1250; X64-SSE-NEXT:    retq # encoding: [0xc3]
1251;
1252; X64-AVX1-LABEL: test_mm_load_ss:
1253; X64-AVX1:       # %bb.0:
1254; X64-AVX1-NEXT:    vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
1255; X64-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
1256; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1257;
1258; X64-AVX512-LABEL: test_mm_load_ss:
1259; X64-AVX512:       # %bb.0:
1260; X64-AVX512-NEXT:    vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
1261; X64-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
1262; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1263  %ld = load float, float* %a0, align 1
1264  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1265  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
1266  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
1267  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
1268  ret <4 x float> %res3
1269}
1270
1271define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
1272; X86-SSE-LABEL: test_mm_load1_ps:
1273; X86-SSE:       # %bb.0:
1274; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1275; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
1276; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1277; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1278; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1279; X86-SSE-NEXT:    retl # encoding: [0xc3]
1280;
1281; X86-AVX1-LABEL: test_mm_load1_ps:
1282; X86-AVX1:       # %bb.0:
1283; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1284; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
1285; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1286;
1287; X86-AVX512-LABEL: test_mm_load1_ps:
1288; X86-AVX512:       # %bb.0:
1289; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1290; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
1291; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1292;
1293; X64-SSE-LABEL: test_mm_load1_ps:
1294; X64-SSE:       # %bb.0:
1295; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
1296; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1297; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
1298; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
1299; X64-SSE-NEXT:    retq # encoding: [0xc3]
1300;
1301; X64-AVX1-LABEL: test_mm_load1_ps:
1302; X64-AVX1:       # %bb.0:
1303; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
1304; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1305;
1306; X64-AVX512-LABEL: test_mm_load1_ps:
1307; X64-AVX512:       # %bb.0:
1308; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
1309; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1310  %ld = load float, float* %a0, align 4
1311  %res0 = insertelement <4 x float> undef, float %ld, i32 0
1312  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
1313  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
1314  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
1315  ret <4 x float> %res3
1316}
1317
1318define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
1319; X86-SSE-LABEL: test_mm_loadh_pi:
1320; X86-SSE:       # %bb.0:
1321; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1322; X86-SSE-NEXT:    movhps (%eax), %xmm0 # encoding: [0x0f,0x16,0x00]
1323; X86-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
1324; X86-SSE-NEXT:    retl # encoding: [0xc3]
1325;
1326; X86-AVX1-LABEL: test_mm_loadh_pi:
1327; X86-AVX1:       # %bb.0:
1328; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1329; X86-AVX1-NEXT:    vmovhps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0x00]
1330; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
1331; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1332;
1333; X86-AVX512-LABEL: test_mm_loadh_pi:
1334; X86-AVX512:       # %bb.0:
1335; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1336; X86-AVX512-NEXT:    vmovhps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0x00]
1337; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
1338; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1339;
1340; X64-SSE-LABEL: test_mm_loadh_pi:
1341; X64-SSE:       # %bb.0:
1342; X64-SSE-NEXT:    movhps (%rdi), %xmm0 # encoding: [0x0f,0x16,0x07]
1343; X64-SSE-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
1344; X64-SSE-NEXT:    retq # encoding: [0xc3]
1345;
1346; X64-AVX1-LABEL: test_mm_loadh_pi:
1347; X64-AVX1:       # %bb.0:
1348; X64-AVX1-NEXT:    vmovhps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0x07]
1349; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
1350; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1351;
1352; X64-AVX512-LABEL: test_mm_loadh_pi:
1353; X64-AVX512:       # %bb.0:
1354; X64-AVX512-NEXT:    vmovhps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0x07]
1355; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],mem[0,1]
1356; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1357  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
1358  %ld  = load <2 x float>, <2 x float>* %ptr
1359  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1360  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1361  ret <4 x float> %res
1362}
1363
1364define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
1365; X86-SSE-LABEL: test_mm_loadl_pi:
1366; X86-SSE:       # %bb.0:
1367; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1368; X86-SSE-NEXT:    movlps (%eax), %xmm0 # encoding: [0x0f,0x12,0x00]
1369; X86-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
1370; X86-SSE-NEXT:    retl # encoding: [0xc3]
1371;
1372; X86-AVX1-LABEL: test_mm_loadl_pi:
1373; X86-AVX1:       # %bb.0:
1374; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1375; X86-AVX1-NEXT:    vmovlps (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x12,0x00]
1376; X86-AVX1-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
1377; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1378;
1379; X86-AVX512-LABEL: test_mm_loadl_pi:
1380; X86-AVX512:       # %bb.0:
1381; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1382; X86-AVX512-NEXT:    vmovlps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x12,0x00]
1383; X86-AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
1384; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1385;
1386; X64-SSE-LABEL: test_mm_loadl_pi:
1387; X64-SSE:       # %bb.0:
1388; X64-SSE-NEXT:    movlps (%rdi), %xmm0 # encoding: [0x0f,0x12,0x07]
1389; X64-SSE-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
1390; X64-SSE-NEXT:    retq # encoding: [0xc3]
1391;
1392; X64-AVX1-LABEL: test_mm_loadl_pi:
1393; X64-AVX1:       # %bb.0:
1394; X64-AVX1-NEXT:    vmovlps (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x12,0x07]
1395; X64-AVX1-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
1396; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1397;
1398; X64-AVX512-LABEL: test_mm_loadl_pi:
1399; X64-AVX512:       # %bb.0:
1400; X64-AVX512-NEXT:    vmovlps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x12,0x07]
1401; X64-AVX512-NEXT:    # xmm0 = mem[0,1],xmm0[2,3]
1402; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1403  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
1404  %ld  = load <2 x float>, <2 x float>* %ptr
1405  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1406  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1407  ret <4 x float> %res
1408}
1409
1410define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
1411; X86-SSE-LABEL: test_mm_loadr_ps:
1412; X86-SSE:       # %bb.0:
1413; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1414; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
1415; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
1416; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
1417; X86-SSE-NEXT:    retl # encoding: [0xc3]
1418;
1419; X86-AVX1-LABEL: test_mm_loadr_ps:
1420; X86-AVX1:       # %bb.0:
1421; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1422; X86-AVX1-NEXT:    vpermilps $27, (%eax), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
1423; X86-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
1424; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1425;
1426; X86-AVX512-LABEL: test_mm_loadr_ps:
1427; X86-AVX512:       # %bb.0:
1428; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1429; X86-AVX512-NEXT:    vpermilps $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
1430; X86-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
1431; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1432;
1433; X64-SSE-LABEL: test_mm_loadr_ps:
1434; X64-SSE:       # %bb.0:
1435; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
1436; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
1437; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
1438; X64-SSE-NEXT:    retq # encoding: [0xc3]
1439;
1440; X64-AVX1-LABEL: test_mm_loadr_ps:
1441; X64-AVX1:       # %bb.0:
1442; X64-AVX1-NEXT:    vpermilps $27, (%rdi), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
1443; X64-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
1444; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1445;
1446; X64-AVX512-LABEL: test_mm_loadr_ps:
1447; X64-AVX512:       # %bb.0:
1448; X64-AVX512-NEXT:    vpermilps $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
1449; X64-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
1450; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1451  %arg0 = bitcast float* %a0 to <4 x float>*
1452  %ld = load <4 x float>, <4 x float>* %arg0, align 16
1453  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1454  ret <4 x float> %res
1455}
1456
1457define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
1458; X86-SSE-LABEL: test_mm_loadu_ps:
1459; X86-SSE:       # %bb.0:
1460; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1461; X86-SSE-NEXT:    movups (%eax), %xmm0 # encoding: [0x0f,0x10,0x00]
1462; X86-SSE-NEXT:    retl # encoding: [0xc3]
1463;
1464; X86-AVX1-LABEL: test_mm_loadu_ps:
1465; X86-AVX1:       # %bb.0:
1466; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1467; X86-AVX1-NEXT:    vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00]
1468; X86-AVX1-NEXT:    retl # encoding: [0xc3]
1469;
1470; X86-AVX512-LABEL: test_mm_loadu_ps:
1471; X86-AVX512:       # %bb.0:
1472; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1473; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00]
1474; X86-AVX512-NEXT:    retl # encoding: [0xc3]
1475;
1476; X64-SSE-LABEL: test_mm_loadu_ps:
1477; X64-SSE:       # %bb.0:
1478; X64-SSE-NEXT:    movups (%rdi), %xmm0 # encoding: [0x0f,0x10,0x07]
1479; X64-SSE-NEXT:    retq # encoding: [0xc3]
1480;
1481; X64-AVX1-LABEL: test_mm_loadu_ps:
1482; X64-AVX1:       # %bb.0:
1483; X64-AVX1-NEXT:    vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07]
1484; X64-AVX1-NEXT:    retq # encoding: [0xc3]
1485;
1486; X64-AVX512-LABEL: test_mm_loadu_ps:
1487; X64-AVX512:       # %bb.0:
1488; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
1489; X64-AVX512-NEXT:    retq # encoding: [0xc3]
1490  %arg0 = bitcast float* %a0 to <4 x float>*
1491  %res = load <4 x float>, <4 x float>* %arg0, align 1
1492  ret <4 x float> %res
1493}
1494
1495define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
1496; SSE-LABEL: test_mm_max_ps:
1497; SSE:       # %bb.0:
1498; SSE-NEXT:    maxps %xmm1, %xmm0 # encoding: [0x0f,0x5f,0xc1]
1499; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1500;
1501; AVX1-LABEL: test_mm_max_ps:
1502; AVX1:       # %bb.0:
1503; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
1504; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1505;
1506; AVX512-LABEL: test_mm_max_ps:
1507; AVX512:       # %bb.0:
1508; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
1509; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1510  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
1511  ret <4 x float> %res
1512}
1513declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
1514
1515define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
1516; SSE-LABEL: test_mm_max_ss:
1517; SSE:       # %bb.0:
1518; SSE-NEXT:    maxss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5f,0xc1]
1519; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1520;
1521; AVX1-LABEL: test_mm_max_ss:
1522; AVX1:       # %bb.0:
1523; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5f,0xc1]
1524; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1525;
1526; AVX512-LABEL: test_mm_max_ss:
1527; AVX512:       # %bb.0:
1528; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
1529; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1530  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
1531  ret <4 x float> %res
1532}
1533declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
1534
1535define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
1536; SSE-LABEL: test_mm_min_ps:
1537; SSE:       # %bb.0:
1538; SSE-NEXT:    minps %xmm1, %xmm0 # encoding: [0x0f,0x5d,0xc1]
1539; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1540;
1541; AVX1-LABEL: test_mm_min_ps:
1542; AVX1:       # %bb.0:
1543; AVX1-NEXT:    vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
1544; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1545;
1546; AVX512-LABEL: test_mm_min_ps:
1547; AVX512:       # %bb.0:
1548; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
1549; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1550  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
1551  ret <4 x float> %res
1552}
1553declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
1554
1555define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
1556; SSE-LABEL: test_mm_min_ss:
1557; SSE:       # %bb.0:
1558; SSE-NEXT:    minss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5d,0xc1]
1559; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1560;
1561; AVX1-LABEL: test_mm_min_ss:
1562; AVX1:       # %bb.0:
1563; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5d,0xc1]
1564; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1565;
1566; AVX512-LABEL: test_mm_min_ss:
1567; AVX512:       # %bb.0:
1568; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
1569; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1570  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
1571  ret <4 x float> %res
1572}
1573declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
1574
1575define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
1576; SSE-LABEL: test_mm_move_ss:
1577; SSE:       # %bb.0:
1578; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
1579; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
1580; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1581;
1582; AVX-LABEL: test_mm_move_ss:
1583; AVX:       # %bb.0:
1584; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
1585; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
1586; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1587  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
1588  ret <4 x float> %res
1589}
1590
1591define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
1592; SSE-LABEL: test_mm_movehl_ps:
1593; SSE:       # %bb.0:
1594; SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
1595; SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
1596; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1597;
1598; AVX1-LABEL: test_mm_movehl_ps:
1599; AVX1:       # %bb.0:
1600; AVX1-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x15,0xc0]
1601; AVX1-NEXT:    # xmm0 = xmm1[1],xmm0[1]
1602; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1603;
1604; AVX512-LABEL: test_mm_movehl_ps:
1605; AVX512:       # %bb.0:
1606; AVX512-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xc0]
1607; AVX512-NEXT:    # xmm0 = xmm1[1],xmm0[1]
1608; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1609  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
1610  ret <4 x float> %res
1611}
1612
1613define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
1614; SSE-LABEL: test_mm_movelh_ps:
1615; SSE:       # %bb.0:
1616; SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
1617; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1618; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1619;
1620; AVX1-LABEL: test_mm_movelh_ps:
1621; AVX1:       # %bb.0:
1622; AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
1623; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1624; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1625;
1626; AVX512-LABEL: test_mm_movelh_ps:
1627; AVX512:       # %bb.0:
1628; AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
1629; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1630; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1631  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1632  ret <4 x float> %res
1633}
1634
1635define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
1636; SSE-LABEL: test_mm_movemask_ps:
1637; SSE:       # %bb.0:
1638; SSE-NEXT:    movmskps %xmm0, %eax # encoding: [0x0f,0x50,0xc0]
1639; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1640;
1641; AVX-LABEL: test_mm_movemask_ps:
1642; AVX:       # %bb.0:
1643; AVX-NEXT:    vmovmskps %xmm0, %eax # encoding: [0xc5,0xf8,0x50,0xc0]
1644; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1645  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
1646  ret i32 %res
1647}
1648declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
1649
1650define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
1651; SSE-LABEL: test_mm_mul_ps:
1652; SSE:       # %bb.0:
1653; SSE-NEXT:    mulps %xmm1, %xmm0 # encoding: [0x0f,0x59,0xc1]
1654; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1655;
1656; AVX1-LABEL: test_mm_mul_ps:
1657; AVX1:       # %bb.0:
1658; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x59,0xc1]
1659; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1660;
1661; AVX512-LABEL: test_mm_mul_ps:
1662; AVX512:       # %bb.0:
1663; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1]
1664; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1665  %res = fmul <4 x float> %a0, %a1
1666  ret <4 x float> %res
1667}
1668
1669define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
1670; SSE-LABEL: test_mm_mul_ss:
1671; SSE:       # %bb.0:
1672; SSE-NEXT:    mulss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x59,0xc1]
1673; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1674;
1675; AVX1-LABEL: test_mm_mul_ss:
1676; AVX1:       # %bb.0:
1677; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x59,0xc1]
1678; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1679;
1680; AVX512-LABEL: test_mm_mul_ss:
1681; AVX512:       # %bb.0:
1682; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1]
1683; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1684  %ext0 = extractelement <4 x float> %a0, i32 0
1685  %ext1 = extractelement <4 x float> %a1, i32 0
1686  %fmul = fmul float %ext0, %ext1
1687  %res = insertelement <4 x float> %a0, float %fmul, i32 0
1688  ret <4 x float> %res
1689}
1690
1691define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
1692; SSE-LABEL: test_mm_or_ps:
1693; SSE:       # %bb.0:
1694; SSE-NEXT:    orps %xmm1, %xmm0 # encoding: [0x0f,0x56,0xc1]
1695; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1696;
1697; AVX1-LABEL: test_mm_or_ps:
1698; AVX1:       # %bb.0:
1699; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1]
1700; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1701;
1702; AVX512-LABEL: test_mm_or_ps:
1703; AVX512:       # %bb.0:
1704; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
1705; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1706  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
1707  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
1708  %res = or <4 x i32> %arg0, %arg1
1709  %bc = bitcast <4 x i32> %res to <4 x float>
1710  ret <4 x float> %bc
1711}
1712
1713define void @test_mm_prefetch(i8* %a0) {
1714; X86-LABEL: test_mm_prefetch:
1715; X86:       # %bb.0:
1716; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
1717; X86-NEXT:    prefetchnta (%eax) # encoding: [0x0f,0x18,0x00]
1718; X86-NEXT:    retl # encoding: [0xc3]
1719;
1720; X64-LABEL: test_mm_prefetch:
1721; X64:       # %bb.0:
1722; X64-NEXT:    prefetchnta (%rdi) # encoding: [0x0f,0x18,0x07]
1723; X64-NEXT:    retq # encoding: [0xc3]
1724  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
1725  ret void
1726}
1727declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
1728
1729define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
1730; SSE-LABEL: test_mm_rcp_ps:
1731; SSE:       # %bb.0:
1732; SSE-NEXT:    rcpps %xmm0, %xmm0 # encoding: [0x0f,0x53,0xc0]
1733; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1734;
1735; AVX-LABEL: test_mm_rcp_ps:
1736; AVX:       # %bb.0:
1737; AVX-NEXT:    vrcpps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x53,0xc0]
1738; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1739  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
1740  ret <4 x float> %res
1741}
1742declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
1743
1744define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
1745; SSE-LABEL: test_mm_rcp_ss:
1746; SSE:       # %bb.0:
1747; SSE-NEXT:    rcpss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x53,0xc0]
1748; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1749;
1750; AVX-LABEL: test_mm_rcp_ss:
1751; AVX:       # %bb.0:
1752; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x53,0xc0]
1753; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1754  %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
1755  ret <4 x float> %rcp
1756}
1757declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
1758
1759define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
1760; SSE-LABEL: test_mm_rsqrt_ps:
1761; SSE:       # %bb.0:
1762; SSE-NEXT:    rsqrtps %xmm0, %xmm0 # encoding: [0x0f,0x52,0xc0]
1763; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1764;
1765; AVX-LABEL: test_mm_rsqrt_ps:
1766; AVX:       # %bb.0:
1767; AVX-NEXT:    vrsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x52,0xc0]
1768; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1769  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
1770  ret <4 x float> %res
1771}
1772declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
1773
1774define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
1775; SSE-LABEL: test_mm_rsqrt_ss:
1776; SSE:       # %bb.0:
1777; SSE-NEXT:    rsqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x52,0xc0]
1778; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1779;
1780; AVX-LABEL: test_mm_rsqrt_ss:
1781; AVX:       # %bb.0:
1782; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x52,0xc0]
1783; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
1784  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
1785  ret <4 x float> %rsqrt
1786}
1787declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
1788
1789define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
1790; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
1791; X86-SSE:       # %bb.0:
1792; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1793; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1794; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1795; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
1796; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1797; X86-SSE-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
1798; X86-SSE-NEXT:    # imm = 0xE07F
1799; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1800; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1801; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
1802; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
1803; X86-SSE-NEXT:    retl # encoding: [0xc3]
1804;
1805; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
1806; X86-AVX:       # %bb.0:
1807; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1808; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1809; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1810; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
1811; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1812; X86-AVX-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
1813; X86-AVX-NEXT:    # imm = 0xE07F
1814; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1815; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1816; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
1817; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
1818; X86-AVX-NEXT:    retl # encoding: [0xc3]
1819;
1820; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
1821; X64-SSE:       # %bb.0:
1822; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1823; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1824; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1825; X64-SSE-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
1826; X64-SSE-NEXT:    # imm = 0xE07F
1827; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1828; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1829; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
1830; X64-SSE-NEXT:    retq # encoding: [0xc3]
1831;
1832; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
1833; X64-AVX:       # %bb.0:
1834; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1835; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1836; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1837; X64-AVX-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
1838; X64-AVX-NEXT:    # imm = 0xE07F
1839; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1840; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1841; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
1842; X64-AVX-NEXT:    retq # encoding: [0xc3]
1843  %1 = alloca i32, align 4
1844  %2 = bitcast i32* %1 to i8*
1845  call void @llvm.x86.sse.stmxcsr(i8* %2)
1846  %3 = load i32, i32* %1
1847  %4 = and i32 %3, -8065
1848  %5 = or i32 %4, %a0
1849  store i32 %5, i32* %1
1850  call void @llvm.x86.sse.ldmxcsr(i8* %2)
1851  ret void
1852}
1853declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
1854
1855define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
1856; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
1857; X86-SSE:       # %bb.0:
1858; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1859; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1860; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1861; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
1862; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1863; X86-SSE-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
1864; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1865; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1866; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
1867; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
1868; X86-SSE-NEXT:    retl # encoding: [0xc3]
1869;
1870; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
1871; X86-AVX:       # %bb.0:
1872; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1873; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1874; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1875; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
1876; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1877; X86-AVX-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
1878; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1879; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1880; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
1881; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
1882; X86-AVX-NEXT:    retl # encoding: [0xc3]
1883;
1884; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
1885; X64-SSE:       # %bb.0:
1886; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1887; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1888; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1889; X64-SSE-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
1890; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1891; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1892; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
1893; X64-SSE-NEXT:    retq # encoding: [0xc3]
1894;
1895; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
1896; X64-AVX:       # %bb.0:
1897; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1898; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1899; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1900; X64-AVX-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
1901; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1902; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1903; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
1904; X64-AVX-NEXT:    retq # encoding: [0xc3]
1905  %1 = alloca i32, align 4
1906  %2 = bitcast i32* %1 to i8*
1907  call void @llvm.x86.sse.stmxcsr(i8* %2)
1908  %3 = load i32, i32* %1
1909  %4 = and i32 %3, -64
1910  %5 = or i32 %4, %a0
1911  store i32 %5, i32* %1
1912  call void @llvm.x86.sse.ldmxcsr(i8* %2)
1913  ret void
1914}
1915
1916define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
1917; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1918; X86-SSE:       # %bb.0:
1919; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
1920; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1921; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1922; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
1923; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1924; X86-SSE-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
1925; X86-SSE-NEXT:    # imm = 0xFFFF7FFF
1926; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1927; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1928; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
1929; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
1930; X86-SSE-NEXT:    retl # encoding: [0xc3]
1931;
1932; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1933; X86-AVX:       # %bb.0:
1934; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
1935; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
1936; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
1937; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
1938; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
1939; X86-AVX-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
1940; X86-AVX-NEXT:    # imm = 0xFFFF7FFF
1941; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
1942; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
1943; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
1944; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
1945; X86-AVX-NEXT:    retl # encoding: [0xc3]
1946;
1947; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1948; X64-SSE:       # %bb.0:
1949; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1950; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
1951; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1952; X64-SSE-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
1953; X64-SSE-NEXT:    # imm = 0xFFFF7FFF
1954; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1955; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1956; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
1957; X64-SSE-NEXT:    retq # encoding: [0xc3]
1958;
1959; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
1960; X64-AVX:       # %bb.0:
1961; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
1962; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
1963; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
1964; X64-AVX-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
1965; X64-AVX-NEXT:    # imm = 0xFFFF7FFF
1966; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
1967; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
1968; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
1969; X64-AVX-NEXT:    retq # encoding: [0xc3]
1970  %1 = alloca i32, align 4
1971  %2 = bitcast i32* %1 to i8*
1972  call void @llvm.x86.sse.stmxcsr(i8* %2)
1973  %3 = load i32, i32* %1
1974  %4 = and i32 %3, -32769
1975  %5 = or i32 %4, %a0
1976  store i32 %5, i32* %1
1977  call void @llvm.x86.sse.ldmxcsr(i8* %2)
1978  ret void
1979}
1980
1981define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
1982; X86-SSE-LABEL: test_mm_set_ps:
1983; X86-SSE:       # %bb.0:
1984; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
1985; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
1986; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
1987; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
1988; X86-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
1989; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1990; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
1991; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
1992; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
1993; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
1994; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
1995; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1996; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
1997; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
1998; X86-SSE-NEXT:    retl # encoding: [0xc3]
1999;
2000; X86-AVX1-LABEL: test_mm_set_ps:
2001; X86-AVX1:       # %bb.0:
2002; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2003; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2004; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2005; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2006; X86-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2007; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2008; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
2009; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2010; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
2011; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2012; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
2013; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2014; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
2015; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
2016; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2017;
2018; X86-AVX512-LABEL: test_mm_set_ps:
2019; X86-AVX512:       # %bb.0:
2020; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2021; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2022; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2023; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
2024; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2025; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2026; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
2027; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
2028; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
2029; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2030; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
2031; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
2032; X86-AVX512-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
2033; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
2034; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2035;
2036; X64-SSE-LABEL: test_mm_set_ps:
2037; X64-SSE:       # %bb.0:
2038; X64-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
2039; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2040; X64-SSE-NEXT:    unpcklps %xmm2, %xmm3 # encoding: [0x0f,0x14,0xda]
2041; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2042; X64-SSE-NEXT:    movlhps %xmm1, %xmm3 # encoding: [0x0f,0x16,0xd9]
2043; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm1[0]
2044; X64-SSE-NEXT:    movaps %xmm3, %xmm0 # encoding: [0x0f,0x28,0xc3]
2045; X64-SSE-NEXT:    retq # encoding: [0xc3]
2046;
2047; X64-AVX1-LABEL: test_mm_set_ps:
2048; X64-AVX1:       # %bb.0:
2049; X64-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2050; X64-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2051; X64-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2052; X64-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2053; X64-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2054; X64-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2055; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2056;
2057; X64-AVX512-LABEL: test_mm_set_ps:
2058; X64-AVX512:       # %bb.0:
2059; X64-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2060; X64-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2061; X64-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2062; X64-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2063; X64-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2064; X64-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2065; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2066  %res0  = insertelement <4 x float> undef, float %a3, i32 0
2067  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
2068  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
2069  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
2070  ret <4 x float> %res3
2071}
2072
2073define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
2074; X86-SSE-LABEL: test_mm_set_ps1:
2075; X86-SSE:       # %bb.0:
2076; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
2077; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2078; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2079; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2080; X86-SSE-NEXT:    retl # encoding: [0xc3]
2081;
2082; X86-AVX1-LABEL: test_mm_set_ps1:
2083; X86-AVX1:       # %bb.0:
2084; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2085; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2086; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2087; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2088; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2089;
2090; X86-AVX512-LABEL: test_mm_set_ps1:
2091; X86-AVX512:       # %bb.0:
2092; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2093; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2094; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2095; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2096;
2097; X64-SSE-LABEL: test_mm_set_ps1:
2098; X64-SSE:       # %bb.0:
2099; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2100; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2101; X64-SSE-NEXT:    retq # encoding: [0xc3]
2102;
2103; X64-AVX1-LABEL: test_mm_set_ps1:
2104; X64-AVX1:       # %bb.0:
2105; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2106; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2107; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2108;
2109; X64-AVX512-LABEL: test_mm_set_ps1:
2110; X64-AVX512:       # %bb.0:
2111; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2112; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2113  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2114  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
2115  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
2116  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
2117  ret <4 x float> %res3
2118}
2119
2120define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
2121; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
2122; X86-SSE:       # %bb.0:
2123; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
2124; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
2125; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
2126; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
2127; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
2128; X86-SSE-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
2129; X86-SSE-NEXT:    # imm = 0x9FFF
2130; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
2131; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
2132; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
2133; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
2134; X86-SSE-NEXT:    retl # encoding: [0xc3]
2135;
2136; X86-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
2137; X86-AVX:       # %bb.0:
2138; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
2139; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
2140; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
2141; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
2142; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
2143; X86-AVX-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
2144; X86-AVX-NEXT:    # imm = 0x9FFF
2145; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
2146; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
2147; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
2148; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
2149; X86-AVX-NEXT:    retl # encoding: [0xc3]
2150;
2151; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
2152; X64-SSE:       # %bb.0:
2153; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2154; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
2155; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
2156; X64-SSE-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
2157; X64-SSE-NEXT:    # imm = 0x9FFF
2158; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
2159; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
2160; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
2161; X64-SSE-NEXT:    retq # encoding: [0xc3]
2162;
2163; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
2164; X64-AVX:       # %bb.0:
2165; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2166; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
2167; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
2168; X64-AVX-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
2169; X64-AVX-NEXT:    # imm = 0x9FFF
2170; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
2171; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
2172; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
2173; X64-AVX-NEXT:    retq # encoding: [0xc3]
2174  %1 = alloca i32, align 4
2175  %2 = bitcast i32* %1 to i8*
2176  call void @llvm.x86.sse.stmxcsr(i8* %2)
2177  %3 = load i32, i32* %1
2178  %4 = and i32 %3, -24577
2179  %5 = or i32 %4, %a0
2180  store i32 %5, i32* %1
2181  call void @llvm.x86.sse.ldmxcsr(i8* %2)
2182  ret void
2183}
2184
2185define <4 x float> @test_mm_set_ss(float %a0) nounwind {
2186; X86-SSE-LABEL: test_mm_set_ss:
2187; X86-SSE:       # %bb.0:
2188; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
2189; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
2190; X86-SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
2191; X86-SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
2192; X86-SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
2193; X86-SSE-NEXT:    retl # encoding: [0xc3]
2194;
2195; X86-AVX1-LABEL: test_mm_set_ss:
2196; X86-AVX1:       # %bb.0:
2197; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2198; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2199; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
2200; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
2201; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
2202; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2203;
2204; X86-AVX512-LABEL: test_mm_set_ss:
2205; X86-AVX512:       # %bb.0:
2206; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2207; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2208; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
2209; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
2210; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
2211; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2212;
2213; X64-SSE-LABEL: test_mm_set_ss:
2214; X64-SSE:       # %bb.0:
2215; X64-SSE-NEXT:    xorps %xmm1, %xmm1 # encoding: [0x0f,0x57,0xc9]
2216; X64-SSE-NEXT:    movss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0x10,0xc8]
2217; X64-SSE-NEXT:    # xmm1 = xmm0[0],xmm1[1,2,3]
2218; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
2219; X64-SSE-NEXT:    retq # encoding: [0xc3]
2220;
2221; X64-AVX-LABEL: test_mm_set_ss:
2222; X64-AVX:       # %bb.0:
2223; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
2224; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
2225; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
2226; X64-AVX-NEXT:    retq # encoding: [0xc3]
2227  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2228  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
2229  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
2230  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
2231  ret <4 x float> %res3
2232}
2233
2234define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
2235; X86-SSE-LABEL: test_mm_set1_ps:
2236; X86-SSE:       # %bb.0:
2237; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
2238; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2239; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2240; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2241; X86-SSE-NEXT:    retl # encoding: [0xc3]
2242;
2243; X86-AVX1-LABEL: test_mm_set1_ps:
2244; X86-AVX1:       # %bb.0:
2245; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2246; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2247; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2248; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2249; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2250;
2251; X86-AVX512-LABEL: test_mm_set1_ps:
2252; X86-AVX512:       # %bb.0:
2253; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
2254; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2255; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2256; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2257;
2258; X64-SSE-LABEL: test_mm_set1_ps:
2259; X64-SSE:       # %bb.0:
2260; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2261; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2262; X64-SSE-NEXT:    retq # encoding: [0xc3]
2263;
2264; X64-AVX1-LABEL: test_mm_set1_ps:
2265; X64-AVX1:       # %bb.0:
2266; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2267; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2268; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2269;
2270; X64-AVX512-LABEL: test_mm_set1_ps:
2271; X64-AVX512:       # %bb.0:
2272; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2273; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2274  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2275  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
2276  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
2277  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
2278  ret <4 x float> %res3
2279}
2280
2281define void @test_mm_setcsr(i32 %a0) nounwind {
2282; X86-SSE-LABEL: test_mm_setcsr:
2283; X86-SSE:       # %bb.0:
2284; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
2285; X86-SSE-NEXT:    ldmxcsr (%eax) # encoding: [0x0f,0xae,0x10]
2286; X86-SSE-NEXT:    retl # encoding: [0xc3]
2287;
2288; X86-AVX-LABEL: test_mm_setcsr:
2289; X86-AVX:       # %bb.0:
2290; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
2291; X86-AVX-NEXT:    vldmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x10]
2292; X86-AVX-NEXT:    retl # encoding: [0xc3]
2293;
2294; X64-SSE-LABEL: test_mm_setcsr:
2295; X64-SSE:       # %bb.0:
2296; X64-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
2297; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2298; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
2299; X64-SSE-NEXT:    retq # encoding: [0xc3]
2300;
2301; X64-AVX-LABEL: test_mm_setcsr:
2302; X64-AVX:       # %bb.0:
2303; X64-AVX-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
2304; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
2305; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
2306; X64-AVX-NEXT:    retq # encoding: [0xc3]
2307  %st = alloca i32, align 4
2308  store i32 %a0, i32* %st, align 4
2309  %bc = bitcast i32* %st to i8*
2310  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
2311  ret void
2312}
2313
2314define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
2315; X86-SSE-LABEL: test_mm_setr_ps:
2316; X86-SSE:       # %bb.0:
2317; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
2318; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2319; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
2320; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
2321; X86-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
2322; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2323; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
2324; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
2325; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
2326; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2327; X86-SSE-NEXT:    unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
2328; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
2329; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
2330; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
2331; X86-SSE-NEXT:    retl # encoding: [0xc3]
2332;
2333; X86-AVX1-LABEL: test_mm_setr_ps:
2334; X86-AVX1:       # %bb.0:
2335; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2336; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2337; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2338; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
2339; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
2340; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
2341; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
2342; X86-AVX1-NEXT:    # xmm3 = mem[0],zero,zero,zero
2343; X86-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2344; X86-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2345; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2346; X86-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2347; X86-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2348; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2349; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2350;
2351; X86-AVX512-LABEL: test_mm_setr_ps:
2352; X86-AVX512:       # %bb.0:
2353; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
2354; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2355; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
2356; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
2357; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
2358; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
2359; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
2360; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
2361; X86-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
2362; X86-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2363; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
2364; X86-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2365; X86-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
2366; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
2367; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2368;
2369; X64-SSE-LABEL: test_mm_setr_ps:
2370; X64-SSE:       # %bb.0:
2371; X64-SSE-NEXT:    unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3]
2372; X64-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2373; X64-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
2374; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2375; X64-SSE-NEXT:    movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
2376; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
2377; X64-SSE-NEXT:    retq # encoding: [0xc3]
2378;
2379; X64-AVX1-LABEL: test_mm_setr_ps:
2380; X64-AVX1:       # %bb.0:
2381; X64-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2382; X64-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2383; X64-AVX1-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
2384; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2385; X64-AVX1-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
2386; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
2387; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2388;
2389; X64-AVX512-LABEL: test_mm_setr_ps:
2390; X64-AVX512:       # %bb.0:
2391; X64-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
2392; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2393; X64-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
2394; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2395; X64-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
2396; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
2397; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2398  %res0  = insertelement <4 x float> undef, float %a0, i32 0
2399  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
2400  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
2401  %res3  = insertelement <4 x float> %res2, float %a3, i32 3
2402  ret <4 x float> %res3
2403}
2404
2405define <4 x float> @test_mm_setzero_ps() {
2406; SSE-LABEL: test_mm_setzero_ps:
2407; SSE:       # %bb.0:
2408; SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
2409; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2410;
2411; AVX1-LABEL: test_mm_setzero_ps:
2412; AVX1:       # %bb.0:
2413; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
2414; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2415;
2416; AVX512-LABEL: test_mm_setzero_ps:
2417; AVX512:       # %bb.0:
2418; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0]
2419; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2420  ret <4 x float> zeroinitializer
2421}
2422
2423define void @test_mm_sfence() nounwind {
2424; CHECK-LABEL: test_mm_sfence:
2425; CHECK:       # %bb.0:
2426; CHECK-NEXT:    sfence # encoding: [0x0f,0xae,0xf8]
2427; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2428  call void @llvm.x86.sse.sfence()
2429  ret void
2430}
2431declare void @llvm.x86.sse.sfence() nounwind readnone
2432
2433define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2434; SSE-LABEL: test_mm_shuffle_ps:
2435; SSE:       # %bb.0:
2436; SSE-NEXT:    shufps $0, %xmm1, %xmm0 # encoding: [0x0f,0xc6,0xc1,0x00]
2437; SSE-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
2438; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2439;
2440; AVX1-LABEL: test_mm_shuffle_ps:
2441; AVX1:       # %bb.0:
2442; AVX1-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
2443; AVX1-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
2444; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2445;
2446; AVX512-LABEL: test_mm_shuffle_ps:
2447; AVX512:       # %bb.0:
2448; AVX512-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
2449; AVX512-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
2450; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2451  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
2452  ret <4 x float> %res
2453}
2454
2455define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
2456; SSE-LABEL: test_mm_sqrt_ps:
2457; SSE:       # %bb.0:
2458; SSE-NEXT:    sqrtps %xmm0, %xmm0 # encoding: [0x0f,0x51,0xc0]
2459; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2460;
2461; AVX1-LABEL: test_mm_sqrt_ps:
2462; AVX1:       # %bb.0:
2463; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x51,0xc0]
2464; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2465;
2466; AVX512-LABEL: test_mm_sqrt_ps:
2467; AVX512:       # %bb.0:
2468; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
2469; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2470  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
2471  ret <4 x float> %res
2472}
2473declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
2474
2475define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
2476; SSE-LABEL: test_mm_sqrt_ss:
2477; SSE:       # %bb.0:
2478; SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
2479; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2480;
2481; AVX1-LABEL: test_mm_sqrt_ss:
2482; AVX1:       # %bb.0:
2483; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
2484; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2485;
2486; AVX512-LABEL: test_mm_sqrt_ss:
2487; AVX512:       # %bb.0:
2488; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
2489; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
2490  %ext = extractelement <4 x float> %a0, i32 0
2491  %sqrt = call float @llvm.sqrt.f32(float %ext)
2492  %ins = insertelement <4 x float> %a0, float %sqrt, i32 0
2493  ret <4 x float> %ins
2494}
2495declare float @llvm.sqrt.f32(float) nounwind readnone
2496
2497define float @test_mm_sqrt_ss_scalar(float %a0) {
2498; X86-SSE-LABEL: test_mm_sqrt_ss_scalar:
2499; X86-SSE:       # %bb.0:
2500; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
2501; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
2502; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
2503; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
2504; X86-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
2505; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
2506; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
2507; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
2508; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
2509; X86-SSE-NEXT:    retl # encoding: [0xc3]
2510;
2511; X86-AVX1-LABEL: test_mm_sqrt_ss_scalar:
2512; X86-AVX1:       # %bb.0:
2513; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
2514; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
2515; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
2516; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
2517; X86-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
2518; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
2519; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
2520; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
2521; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
2522; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2523;
2524; X86-AVX512-LABEL: test_mm_sqrt_ss_scalar:
2525; X86-AVX512:       # %bb.0:
2526; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
2527; X86-AVX512-NEXT:    .cfi_def_cfa_offset 8
2528; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
2529; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
2530; X86-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
2531; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
2532; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
2533; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
2534; X86-AVX512-NEXT:    .cfi_def_cfa_offset 4
2535; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2536;
2537; X64-SSE-LABEL: test_mm_sqrt_ss_scalar:
2538; X64-SSE:       # %bb.0:
2539; X64-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
2540; X64-SSE-NEXT:    retq # encoding: [0xc3]
2541;
2542; X64-AVX1-LABEL: test_mm_sqrt_ss_scalar:
2543; X64-AVX1:       # %bb.0:
2544; X64-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
2545; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2546;
2547; X64-AVX512-LABEL: test_mm_sqrt_ss_scalar:
2548; X64-AVX512:       # %bb.0:
2549; X64-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
2550; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2551  %sqrt = call float @llvm.sqrt.f32(float %a0)
2552  ret float %sqrt
2553}
2554
2555define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
2556; X86-SSE-LABEL: test_mm_store_ps:
2557; X86-SSE:       # %bb.0:
2558; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2559; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2560; X86-SSE-NEXT:    retl # encoding: [0xc3]
2561;
2562; X86-AVX1-LABEL: test_mm_store_ps:
2563; X86-AVX1:       # %bb.0:
2564; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2565; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2566; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2567;
2568; X86-AVX512-LABEL: test_mm_store_ps:
2569; X86-AVX512:       # %bb.0:
2570; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2571; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2572; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2573;
2574; X64-SSE-LABEL: test_mm_store_ps:
2575; X64-SSE:       # %bb.0:
2576; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2577; X64-SSE-NEXT:    retq # encoding: [0xc3]
2578;
2579; X64-AVX1-LABEL: test_mm_store_ps:
2580; X64-AVX1:       # %bb.0:
2581; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2582; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2583;
2584; X64-AVX512-LABEL: test_mm_store_ps:
2585; X64-AVX512:       # %bb.0:
2586; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2587; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2588  %arg0 = bitcast float* %a0 to <4 x float>*
2589  store <4 x float> %a1, <4 x float>* %arg0, align 16
2590  ret void
2591}
2592
2593define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
2594; X86-SSE-LABEL: test_mm_store_ps1:
2595; X86-SSE:       # %bb.0:
2596; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2597; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2598; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2599; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2600; X86-SSE-NEXT:    retl # encoding: [0xc3]
2601;
2602; X86-AVX1-LABEL: test_mm_store_ps1:
2603; X86-AVX1:       # %bb.0:
2604; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2605; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2606; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2607; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2608; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2609;
2610; X86-AVX512-LABEL: test_mm_store_ps1:
2611; X86-AVX512:       # %bb.0:
2612; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2613; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2614; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2615; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2616;
2617; X64-SSE-LABEL: test_mm_store_ps1:
2618; X64-SSE:       # %bb.0:
2619; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2620; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2621; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2622; X64-SSE-NEXT:    retq # encoding: [0xc3]
2623;
2624; X64-AVX1-LABEL: test_mm_store_ps1:
2625; X64-AVX1:       # %bb.0:
2626; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2627; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2628; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2629; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2630;
2631; X64-AVX512-LABEL: test_mm_store_ps1:
2632; X64-AVX512:       # %bb.0:
2633; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2634; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2635; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2636  %arg0 = bitcast float* %a0 to <4 x float>*
2637  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
2638  store <4 x float> %shuf, <4 x float>* %arg0, align 16
2639  ret void
2640}
2641
2642define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
2643; X86-SSE-LABEL: test_mm_store_ss:
2644; X86-SSE:       # %bb.0:
2645; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2646; X86-SSE-NEXT:    movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
2647; X86-SSE-NEXT:    retl # encoding: [0xc3]
2648;
2649; X86-AVX1-LABEL: test_mm_store_ss:
2650; X86-AVX1:       # %bb.0:
2651; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2652; X86-AVX1-NEXT:    vmovss %xmm0, (%eax) # encoding: [0xc5,0xfa,0x11,0x00]
2653; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2654;
2655; X86-AVX512-LABEL: test_mm_store_ss:
2656; X86-AVX512:       # %bb.0:
2657; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2658; X86-AVX512-NEXT:    vmovss %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
2659; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2660;
2661; X64-SSE-LABEL: test_mm_store_ss:
2662; X64-SSE:       # %bb.0:
2663; X64-SSE-NEXT:    movss %xmm0, (%rdi) # encoding: [0xf3,0x0f,0x11,0x07]
2664; X64-SSE-NEXT:    retq # encoding: [0xc3]
2665;
2666; X64-AVX1-LABEL: test_mm_store_ss:
2667; X64-AVX1:       # %bb.0:
2668; X64-AVX1-NEXT:    vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
2669; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2670;
2671; X64-AVX512-LABEL: test_mm_store_ss:
2672; X64-AVX512:       # %bb.0:
2673; X64-AVX512-NEXT:    vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
2674; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2675  %ext = extractelement <4 x float> %a1, i32 0
2676  store float %ext, float* %a0, align 1
2677  ret void
2678}
2679
2680define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
2681; X86-SSE-LABEL: test_mm_store1_ps:
2682; X86-SSE:       # %bb.0:
2683; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2684; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2685; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2686; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2687; X86-SSE-NEXT:    retl # encoding: [0xc3]
2688;
2689; X86-AVX1-LABEL: test_mm_store1_ps:
2690; X86-AVX1:       # %bb.0:
2691; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2692; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2693; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2694; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2695; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2696;
2697; X86-AVX512-LABEL: test_mm_store1_ps:
2698; X86-AVX512:       # %bb.0:
2699; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2700; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2701; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2702; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2703;
2704; X64-SSE-LABEL: test_mm_store1_ps:
2705; X64-SSE:       # %bb.0:
2706; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
2707; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
2708; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2709; X64-SSE-NEXT:    retq # encoding: [0xc3]
2710;
2711; X64-AVX1-LABEL: test_mm_store1_ps:
2712; X64-AVX1:       # %bb.0:
2713; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
2714; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
2715; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2716; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2717;
2718; X64-AVX512-LABEL: test_mm_store1_ps:
2719; X64-AVX512:       # %bb.0:
2720; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
2721; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2722; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2723  %arg0 = bitcast float* %a0 to <4 x float>*
2724  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
2725  store <4 x float> %shuf, <4 x float>* %arg0, align 16
2726  ret void
2727}
2728
2729define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind {
2730; X86-SSE-LABEL: test_mm_storeh_pi:
2731; X86-SSE:       # %bb.0:
2732; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
2733; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
2734; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
2735; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
2736; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
2737; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
2738; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
2739; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
2740; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
2741; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
2742; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
2743; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
2744; X86-SSE-NEXT:    retl # encoding: [0xc3]
2745;
2746; X86-AVX1-LABEL: test_mm_storeh_pi:
2747; X86-AVX1:       # %bb.0:
2748; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2749; X86-AVX1-NEXT:    vmovhps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x17,0x00]
2750; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2751;
2752; X86-AVX512-LABEL: test_mm_storeh_pi:
2753; X86-AVX512:       # %bb.0:
2754; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2755; X86-AVX512-NEXT:    vmovhps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x00]
2756; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2757;
2758; X64-SSE-LABEL: test_mm_storeh_pi:
2759; X64-SSE:       # %bb.0:
2760; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
2761; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
2762; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2763; X64-SSE-NEXT:    retq # encoding: [0xc3]
2764;
2765; X64-AVX1-LABEL: test_mm_storeh_pi:
2766; X64-AVX1:       # %bb.0:
2767; X64-AVX1-NEXT:    vpextrq $1, %xmm0, %rax # encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
2768; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2769; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2770;
2771; X64-AVX512-LABEL: test_mm_storeh_pi:
2772; X64-AVX512:       # %bb.0:
2773; X64-AVX512-NEXT:    vpextrq $1, %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
2774; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2775; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2776  %ptr = bitcast x86_mmx* %a0 to i64*
2777  %bc  = bitcast <4 x float> %a1 to <2 x i64>
2778  %ext = extractelement <2 x i64> %bc, i32 1
2779  store i64 %ext, i64* %ptr
2780  ret void
2781}
2782
2783define void @test_mm_storeh_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
2784; X86-SSE-LABEL: test_mm_storeh_pi2:
2785; X86-SSE:       # %bb.0:
2786; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2787; X86-SSE-NEXT:    movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
2788; X86-SSE-NEXT:    retl # encoding: [0xc3]
2789;
2790; X86-AVX1-LABEL: test_mm_storeh_pi2:
2791; X86-AVX1:       # %bb.0:
2792; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2793; X86-AVX1-NEXT:    vmovhps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x17,0x00]
2794; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2795;
2796; X86-AVX512-LABEL: test_mm_storeh_pi2:
2797; X86-AVX512:       # %bb.0:
2798; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2799; X86-AVX512-NEXT:    vmovhps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x00]
2800; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2801;
2802; X64-SSE-LABEL: test_mm_storeh_pi2:
2803; X64-SSE:       # %bb.0:
2804; X64-SSE-NEXT:    movhps %xmm0, (%rdi) # encoding: [0x0f,0x17,0x07]
2805; X64-SSE-NEXT:    retq # encoding: [0xc3]
2806;
2807; X64-AVX1-LABEL: test_mm_storeh_pi2:
2808; X64-AVX1:       # %bb.0:
2809; X64-AVX1-NEXT:    vmovhps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x17,0x07]
2810; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2811;
2812; X64-AVX512-LABEL: test_mm_storeh_pi2:
2813; X64-AVX512:       # %bb.0:
2814; X64-AVX512-NEXT:    vmovhps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x17,0x07]
2815; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2816  %ptr = bitcast x86_mmx* %a0 to <2 x float>*
2817  %ext = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
2818  store <2 x float> %ext, <2 x float>* %ptr
2819  ret void
2820}
2821
2822define void @test_mm_storel_pi(x86_mmx *%a0, <4 x float> %a1) nounwind {
2823; X86-SSE-LABEL: test_mm_storel_pi:
2824; X86-SSE:       # %bb.0:
2825; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
2826; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
2827; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
2828; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
2829; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
2830; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
2831; X86-SSE-NEXT:    movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24]
2832; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
2833; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
2834; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
2835; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
2836; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
2837; X86-SSE-NEXT:    retl # encoding: [0xc3]
2838;
2839; X86-AVX1-LABEL: test_mm_storel_pi:
2840; X86-AVX1:       # %bb.0:
2841; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2842; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
2843; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2844;
2845; X86-AVX512-LABEL: test_mm_storel_pi:
2846; X86-AVX512:       # %bb.0:
2847; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2848; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
2849; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2850;
2851; X64-SSE-LABEL: test_mm_storel_pi:
2852; X64-SSE:       # %bb.0:
2853; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
2854; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
2855; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2856; X64-SSE-NEXT:    retq # encoding: [0xc3]
2857;
2858; X64-AVX1-LABEL: test_mm_storel_pi:
2859; X64-AVX1:       # %bb.0:
2860; X64-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
2861; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2862; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2863;
2864; X64-AVX512-LABEL: test_mm_storel_pi:
2865; X64-AVX512:       # %bb.0:
2866; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
2867; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
2868; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2869  %ptr = bitcast x86_mmx* %a0 to i64*
2870  %bc  = bitcast <4 x float> %a1 to <2 x i64>
2871  %ext = extractelement <2 x i64> %bc, i32 0
2872  store i64 %ext, i64* %ptr
2873  ret void
2874}
2875
2876; FIXME: Switch the frontend to use this code.
2877define void @test_mm_storel_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
2878; X86-SSE-LABEL: test_mm_storel_pi2:
2879; X86-SSE:       # %bb.0:
2880; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2881; X86-SSE-NEXT:    movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00]
2882; X86-SSE-NEXT:    retl # encoding: [0xc3]
2883;
2884; X86-AVX1-LABEL: test_mm_storel_pi2:
2885; X86-AVX1:       # %bb.0:
2886; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2887; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
2888; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2889;
2890; X86-AVX512-LABEL: test_mm_storel_pi2:
2891; X86-AVX512:       # %bb.0:
2892; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2893; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
2894; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2895;
2896; X64-SSE-LABEL: test_mm_storel_pi2:
2897; X64-SSE:       # %bb.0:
2898; X64-SSE-NEXT:    movlps %xmm0, (%rdi) # encoding: [0x0f,0x13,0x07]
2899; X64-SSE-NEXT:    retq # encoding: [0xc3]
2900;
2901; X64-AVX1-LABEL: test_mm_storel_pi2:
2902; X64-AVX1:       # %bb.0:
2903; X64-AVX1-NEXT:    vmovlps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x13,0x07]
2904; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2905;
2906; X64-AVX512-LABEL: test_mm_storel_pi2:
2907; X64-AVX512:       # %bb.0:
2908; X64-AVX512-NEXT:    vmovlps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x07]
2909; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2910  %ptr = bitcast x86_mmx* %a0 to <2 x float>*
2911  %ext = shufflevector <4 x float> %a1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
2912  store <2 x float> %ext, <2 x float>* %ptr
2913  ret void
2914}
2915
2916define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
2917; X86-SSE-LABEL: test_mm_storer_ps:
2918; X86-SSE:       # %bb.0:
2919; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2920; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
2921; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
2922; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
2923; X86-SSE-NEXT:    retl # encoding: [0xc3]
2924;
2925; X86-AVX1-LABEL: test_mm_storer_ps:
2926; X86-AVX1:       # %bb.0:
2927; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2928; X86-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2929; X86-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
2930; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
2931; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2932;
2933; X86-AVX512-LABEL: test_mm_storer_ps:
2934; X86-AVX512:       # %bb.0:
2935; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2936; X86-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2937; X86-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
2938; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
2939; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2940;
2941; X64-SSE-LABEL: test_mm_storer_ps:
2942; X64-SSE:       # %bb.0:
2943; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
2944; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
2945; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
2946; X64-SSE-NEXT:    retq # encoding: [0xc3]
2947;
2948; X64-AVX1-LABEL: test_mm_storer_ps:
2949; X64-AVX1:       # %bb.0:
2950; X64-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2951; X64-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
2952; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
2953; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2954;
2955; X64-AVX512-LABEL: test_mm_storer_ps:
2956; X64-AVX512:       # %bb.0:
2957; X64-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
2958; X64-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
2959; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
2960; X64-AVX512-NEXT:    retq # encoding: [0xc3]
2961  %arg0 = bitcast float* %a0 to <4 x float>*
2962  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
2963  store <4 x float> %shuf, <4 x float>* %arg0, align 16
2964  ret void
2965}
2966
2967define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
2968; X86-SSE-LABEL: test_mm_storeu_ps:
2969; X86-SSE:       # %bb.0:
2970; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2971; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
2972; X86-SSE-NEXT:    retl # encoding: [0xc3]
2973;
2974; X86-AVX1-LABEL: test_mm_storeu_ps:
2975; X86-AVX1:       # %bb.0:
2976; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2977; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00]
2978; X86-AVX1-NEXT:    retl # encoding: [0xc3]
2979;
2980; X86-AVX512-LABEL: test_mm_storeu_ps:
2981; X86-AVX512:       # %bb.0:
2982; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
2983; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
2984; X86-AVX512-NEXT:    retl # encoding: [0xc3]
2985;
2986; X64-SSE-LABEL: test_mm_storeu_ps:
2987; X64-SSE:       # %bb.0:
2988; X64-SSE-NEXT:    movups %xmm0, (%rdi) # encoding: [0x0f,0x11,0x07]
2989; X64-SSE-NEXT:    retq # encoding: [0xc3]
2990;
2991; X64-AVX1-LABEL: test_mm_storeu_ps:
2992; X64-AVX1:       # %bb.0:
2993; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07]
2994; X64-AVX1-NEXT:    retq # encoding: [0xc3]
2995;
2996; X64-AVX512-LABEL: test_mm_storeu_ps:
2997; X64-AVX512:       # %bb.0:
2998; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
2999; X64-AVX512-NEXT:    retq # encoding: [0xc3]
3000  %arg0 = bitcast float* %a0 to <4 x float>*
3001  store <4 x float> %a1, <4 x float>* %arg0, align 1
3002  ret void
3003}
3004
3005define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
3006; X86-SSE-LABEL: test_mm_stream_ps:
3007; X86-SSE:       # %bb.0:
3008; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3009; X86-SSE-NEXT:    movntps %xmm0, (%eax) # encoding: [0x0f,0x2b,0x00]
3010; X86-SSE-NEXT:    retl # encoding: [0xc3]
3011;
3012; X86-AVX1-LABEL: test_mm_stream_ps:
3013; X86-AVX1:       # %bb.0:
3014; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3015; X86-AVX1-NEXT:    vmovntps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x2b,0x00]
3016; X86-AVX1-NEXT:    retl # encoding: [0xc3]
3017;
3018; X86-AVX512-LABEL: test_mm_stream_ps:
3019; X86-AVX512:       # %bb.0:
3020; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
3021; X86-AVX512-NEXT:    vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00]
3022; X86-AVX512-NEXT:    retl # encoding: [0xc3]
3023;
3024; X64-SSE-LABEL: test_mm_stream_ps:
3025; X64-SSE:       # %bb.0:
3026; X64-SSE-NEXT:    movntps %xmm0, (%rdi) # encoding: [0x0f,0x2b,0x07]
3027; X64-SSE-NEXT:    retq # encoding: [0xc3]
3028;
3029; X64-AVX1-LABEL: test_mm_stream_ps:
3030; X64-AVX1:       # %bb.0:
3031; X64-AVX1-NEXT:    vmovntps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x2b,0x07]
3032; X64-AVX1-NEXT:    retq # encoding: [0xc3]
3033;
3034; X64-AVX512-LABEL: test_mm_stream_ps:
3035; X64-AVX512:       # %bb.0:
3036; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
3037; X64-AVX512-NEXT:    retq # encoding: [0xc3]
3038  %arg0 = bitcast float* %a0 to <4 x float>*
3039  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
3040  ret void
3041}
3042
3043define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3044; SSE-LABEL: test_mm_sub_ps:
3045; SSE:       # %bb.0:
3046; SSE-NEXT:    subps %xmm1, %xmm0 # encoding: [0x0f,0x5c,0xc1]
3047; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3048;
3049; AVX1-LABEL: test_mm_sub_ps:
3050; AVX1:       # %bb.0:
3051; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5c,0xc1]
3052; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3053;
3054; AVX512-LABEL: test_mm_sub_ps:
3055; AVX512:       # %bb.0:
3056; AVX512-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1]
3057; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3058  %res = fsub <4 x float> %a0, %a1
3059  ret <4 x float> %res
3060}
3061
3062define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3063; SSE-LABEL: test_mm_sub_ss:
3064; SSE:       # %bb.0:
3065; SSE-NEXT:    subss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5c,0xc1]
3066; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3067;
3068; AVX1-LABEL: test_mm_sub_ss:
3069; AVX1:       # %bb.0:
3070; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5c,0xc1]
3071; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3072;
3073; AVX512-LABEL: test_mm_sub_ss:
3074; AVX512:       # %bb.0:
3075; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1]
3076; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3077  %ext0 = extractelement <4 x float> %a0, i32 0
3078  %ext1 = extractelement <4 x float> %a1, i32 0
3079  %fsub = fsub float %ext0, %ext1
3080  %res = insertelement <4 x float> %a0, float %fsub, i32 0
3081  ret <4 x float> %res
3082}
3083
3084define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
3085; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS:
3086; X86-SSE:       # %bb.0:
3087; X86-SSE-NEXT:    pushl %esi # encoding: [0x56]
3088; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
3089; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
3090; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
3091; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
3092; X86-SSE-NEXT:    movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06]
3093; X86-SSE-NEXT:    movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a]
3094; X86-SSE-NEXT:    movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11]
3095; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
3096; X86-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
3097; X86-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
3098; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3099; X86-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
3100; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
3101; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
3102; X86-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
3103; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3104; X86-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
3105; X86-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3106; X86-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
3107; X86-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
3108; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
3109; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
3110; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
3111; X86-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
3112; X86-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
3113; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
3114; X86-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
3115; X86-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
3116; X86-SSE-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
3117; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
3118; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
3119; X86-SSE-NEXT:    movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10]
3120; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
3121; X86-SSE-NEXT:    retl # encoding: [0xc3]
3122;
3123; X86-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
3124; X86-AVX1:       # %bb.0:
3125; X86-AVX1-NEXT:    pushl %esi # encoding: [0x56]
3126; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
3127; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
3128; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
3129; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
3130; X86-AVX1-NEXT:    vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06]
3131; X86-AVX1-NEXT:    vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
3132; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11]
3133; X86-AVX1-NEXT:    vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18]
3134; X86-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
3135; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3136; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
3137; X86-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3138; X86-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
3139; X86-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3140; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
3141; X86-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3142; X86-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
3143; X86-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3144; X86-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
3145; X86-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3146; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
3147; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3148; X86-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
3149; X86-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3150; X86-AVX1-NEXT:    vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16]
3151; X86-AVX1-NEXT:    vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a]
3152; X86-AVX1-NEXT:    vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21]
3153; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
3154; X86-AVX1-NEXT:    popl %esi # encoding: [0x5e]
3155; X86-AVX1-NEXT:    retl # encoding: [0xc3]
3156;
3157; X86-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
3158; X86-AVX512:       # %bb.0:
3159; X86-AVX512-NEXT:    pushl %esi # encoding: [0x56]
3160; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
3161; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
3162; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
3163; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
3164; X86-AVX512-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
3165; X86-AVX512-NEXT:    vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a]
3166; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11]
3167; X86-AVX512-NEXT:    vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18]
3168; X86-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
3169; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3170; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
3171; X86-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3172; X86-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
3173; X86-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3174; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
3175; X86-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3176; X86-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
3177; X86-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3178; X86-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
3179; X86-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3180; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
3181; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3182; X86-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
3183; X86-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3184; X86-AVX512-NEXT:    vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16]
3185; X86-AVX512-NEXT:    vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a]
3186; X86-AVX512-NEXT:    vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21]
3187; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
3188; X86-AVX512-NEXT:    popl %esi # encoding: [0x5e]
3189; X86-AVX512-NEXT:    retl # encoding: [0xc3]
3190;
3191; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS:
3192; X64-SSE:       # %bb.0:
3193; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
3194; X64-SSE-NEXT:    movaps (%rsi), %xmm1 # encoding: [0x0f,0x28,0x0e]
3195; X64-SSE-NEXT:    movaps (%rdx), %xmm2 # encoding: [0x0f,0x28,0x12]
3196; X64-SSE-NEXT:    movaps (%rcx), %xmm3 # encoding: [0x0f,0x28,0x19]
3197; X64-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
3198; X64-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
3199; X64-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3200; X64-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
3201; X64-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
3202; X64-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
3203; X64-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
3204; X64-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3205; X64-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
3206; X64-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3207; X64-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
3208; X64-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
3209; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
3210; X64-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
3211; X64-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
3212; X64-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
3213; X64-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
3214; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
3215; X64-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
3216; X64-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
3217; X64-SSE-NEXT:    movaps %xmm1, (%rdi) # encoding: [0x0f,0x29,0x0f]
3218; X64-SSE-NEXT:    movaps %xmm5, (%rsi) # encoding: [0x0f,0x29,0x2e]
3219; X64-SSE-NEXT:    movaps %xmm3, (%rdx) # encoding: [0x0f,0x29,0x1a]
3220; X64-SSE-NEXT:    movaps %xmm2, (%rcx) # encoding: [0x0f,0x29,0x11]
3221; X64-SSE-NEXT:    retq # encoding: [0xc3]
3222;
3223; X64-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
3224; X64-AVX1:       # %bb.0:
3225; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
3226; X64-AVX1-NEXT:    vmovaps (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0e]
3227; X64-AVX1-NEXT:    vmovaps (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x12]
3228; X64-AVX1-NEXT:    vmovaps (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x19]
3229; X64-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
3230; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3231; X64-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
3232; X64-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3233; X64-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
3234; X64-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3235; X64-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
3236; X64-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3237; X64-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
3238; X64-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3239; X64-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
3240; X64-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3241; X64-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
3242; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3243; X64-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
3244; X64-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3245; X64-AVX1-NEXT:    vmovaps %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x29,0x17]
3246; X64-AVX1-NEXT:    vmovaps %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x29,0x1e]
3247; X64-AVX1-NEXT:    vmovaps %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x29,0x22]
3248; X64-AVX1-NEXT:    vmovaps %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x29,0x01]
3249; X64-AVX1-NEXT:    retq # encoding: [0xc3]
3250;
3251; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
3252; X64-AVX512:       # %bb.0:
3253; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
3254; X64-AVX512-NEXT:    vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e]
3255; X64-AVX512-NEXT:    vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12]
3256; X64-AVX512-NEXT:    vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19]
3257; X64-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
3258; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3259; X64-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
3260; X64-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3261; X64-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
3262; X64-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3263; X64-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
3264; X64-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3265; X64-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
3266; X64-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
3267; X64-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
3268; X64-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
3269; X64-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
3270; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
3271; X64-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
3272; X64-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
3273; X64-AVX512-NEXT:    vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17]
3274; X64-AVX512-NEXT:    vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e]
3275; X64-AVX512-NEXT:    vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22]
3276; X64-AVX512-NEXT:    vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01]
3277; X64-AVX512-NEXT:    retq # encoding: [0xc3]
3278  %row0 = load <4 x float>, <4 x float>* %a0, align 16
3279  %row1 = load <4 x float>, <4 x float>* %a1, align 16
3280  %row2 = load <4 x float>, <4 x float>* %a2, align 16
3281  %row3 = load <4 x float>, <4 x float>* %a3, align 16
3282  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3283  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3284  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3285  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3286  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
3287  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
3288  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
3289  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
3290  store <4 x float> %res0, <4 x float>* %a0, align 16
3291  store <4 x float> %res1, <4 x float>* %a1, align 16
3292  store <4 x float> %res2, <4 x float>* %a2, align 16
3293  store <4 x float> %res3, <4 x float>* %a3, align 16
3294  ret void
3295}
3296
3297define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3298; SSE-LABEL: test_mm_ucomieq_ss:
3299; SSE:       # %bb.0:
3300; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3301; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3302; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3303; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3304; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3305; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3306;
3307; AVX1-LABEL: test_mm_ucomieq_ss:
3308; AVX1:       # %bb.0:
3309; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3310; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3311; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3312; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3313; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3314; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3315;
3316; AVX512-LABEL: test_mm_ucomieq_ss:
3317; AVX512:       # %bb.0:
3318; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3319; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
3320; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
3321; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
3322; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3323; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3324  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
3325  ret i32 %res
3326}
3327declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
3328
3329define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3330; SSE-LABEL: test_mm_ucomige_ss:
3331; SSE:       # %bb.0:
3332; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3333; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3334; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3335; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3336;
3337; AVX1-LABEL: test_mm_ucomige_ss:
3338; AVX1:       # %bb.0:
3339; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3340; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3341; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3342; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3343;
3344; AVX512-LABEL: test_mm_ucomige_ss:
3345; AVX512:       # %bb.0:
3346; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3347; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3348; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3349; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3350  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
3351  ret i32 %res
3352}
3353declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
3354
3355define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3356; SSE-LABEL: test_mm_ucomigt_ss:
3357; SSE:       # %bb.0:
3358; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3359; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3360; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3361; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3362;
3363; AVX1-LABEL: test_mm_ucomigt_ss:
3364; AVX1:       # %bb.0:
3365; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3366; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3367; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3368; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3369;
3370; AVX512-LABEL: test_mm_ucomigt_ss:
3371; AVX512:       # %bb.0:
3372; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3373; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3374; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3375; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3376  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
3377  ret i32 %res
3378}
3379declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
3380
3381define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3382; SSE-LABEL: test_mm_ucomile_ss:
3383; SSE:       # %bb.0:
3384; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3385; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
3386; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3387; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3388;
3389; AVX1-LABEL: test_mm_ucomile_ss:
3390; AVX1:       # %bb.0:
3391; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3392; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
3393; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3394; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3395;
3396; AVX512-LABEL: test_mm_ucomile_ss:
3397; AVX512:       # %bb.0:
3398; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3399; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
3400; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
3401; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3402  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
3403  ret i32 %res
3404}
3405declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
3406
3407define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3408; SSE-LABEL: test_mm_ucomilt_ss:
3409; SSE:       # %bb.0:
3410; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3411; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
3412; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3413; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3414;
3415; AVX1-LABEL: test_mm_ucomilt_ss:
3416; AVX1:       # %bb.0:
3417; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3418; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
3419; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3420; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3421;
3422; AVX512-LABEL: test_mm_ucomilt_ss:
3423; AVX512:       # %bb.0:
3424; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
3425; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
3426; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
3427; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3428  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
3429  ret i32 %res
3430}
3431declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
3432
3433define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
3434; SSE-LABEL: test_mm_ucomineq_ss:
3435; SSE:       # %bb.0:
3436; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
3437; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
3438; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
3439; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
3440; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3441; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3442;
3443; AVX1-LABEL: test_mm_ucomineq_ss:
3444; AVX1:       # %bb.0:
3445; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
3446; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
3447; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
3448; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
3449; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3450; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3451;
3452; AVX512-LABEL: test_mm_ucomineq_ss:
3453; AVX512:       # %bb.0:
3454; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
3455; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
3456; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
3457; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
3458; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
3459; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3460  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
3461  ret i32 %res
3462}
3463declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
3464
3465define <4 x float> @test_mm_undefined_ps() {
3466; CHECK-LABEL: test_mm_undefined_ps:
3467; CHECK:       # %bb.0:
3468; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3469  ret <4 x float> undef
3470}
3471
3472define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3473; SSE-LABEL: test_mm_unpackhi_ps:
3474; SSE:       # %bb.0:
3475; SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
3476; SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3477; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3478;
3479; AVX1-LABEL: test_mm_unpackhi_ps:
3480; AVX1:       # %bb.0:
3481; AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
3482; AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3483; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3484;
3485; AVX512-LABEL: test_mm_unpackhi_ps:
3486; AVX512:       # %bb.0:
3487; AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
3488; AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3489; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3490  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
3491  ret <4 x float> %res
3492}
3493
3494define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3495; SSE-LABEL: test_mm_unpacklo_ps:
3496; SSE:       # %bb.0:
3497; SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
3498; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3499; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3500;
3501; AVX1-LABEL: test_mm_unpacklo_ps:
3502; AVX1:       # %bb.0:
3503; AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1]
3504; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3505; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3506;
3507; AVX512-LABEL: test_mm_unpacklo_ps:
3508; AVX512:       # %bb.0:
3509; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1]
3510; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3511; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3512  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
3513  ret <4 x float> %res
3514}
3515
3516define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
3517; SSE-LABEL: test_mm_xor_ps:
3518; SSE:       # %bb.0:
3519; SSE-NEXT:    xorps %xmm1, %xmm0 # encoding: [0x0f,0x57,0xc1]
3520; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3521;
3522; AVX1-LABEL: test_mm_xor_ps:
3523; AVX1:       # %bb.0:
3524; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1]
3525; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3526;
3527; AVX512-LABEL: test_mm_xor_ps:
3528; AVX512:       # %bb.0:
3529; AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
3530; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
3531  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
3532  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
3533  %res = xor <4 x i32> %arg0, %arg1
3534  %bc = bitcast <4 x i32> %res to <4 x float>
3535  ret <4 x float> %bc
3536}
3537
3538!0 = !{i32 1}
3539