; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-linux-generic -mattr=avx < %s | FileCheck %s
; Bug 45833:
; The SplitVecOp_MSTORE method should split an extended value type
; according to the halving of the enveloping type, to avoid all sorts
; of inconsistencies downstream. For example, for an extended value type
; with VL=14 and an enveloping type of VL=16 that is split 8/8, the
; extended type should be split 8/6, not 7/7. This also accounts for hi
; masked stores that get zero storage size (and are unused).
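; <9 x float> widens to the enveloping <16 x float>, which splits 8/8, so the
; extended type splits 8/1: the low eight lanes go out in one vmaskmovps at
; (%rdi) and the single hi lane in a second one at 32(%rdi).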
define void @mstore_split9(<9 x float> %value, <9 x float>* %addr, <9 x i1> %mask) {
; CHECK-LABEL: mstore_split9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vmovd %eax, %xmm2
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vmaskmovps %ymm1, %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrw $1, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrw $3, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vpinsrw $4, %r9d, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; CHECK-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v9f32.p0v9f32(<9 x float> %value, <9 x float>* %addr, i32 4, <9 x i1> %mask)
  ret void
}
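; <13 x float> also widens to <16 x float> (split 8/8), so the extended type
; splits 8/5: the five hi lanes are masked-stored at 32(%rdi).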
define void @mstore_split13(<13 x float> %value, <13 x float>* %addr, <13 x i1> %mask) {
; CHECK-LABEL: mstore_split13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vmovd %eax, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; CHECK-NEXT:    vmaskmovps %ymm1, %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrw $1, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrw $3, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vpinsrw $4, %r9d, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; CHECK-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v13f32.p0v13f32(<13 x float> %value, <13 x float>* %addr, i32 4, <13 x i1> %mask)
  ret void
}
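; The motivating case from the bug report: <14 x float> widens to
; <16 x float> (split 8/8), so the extended type must split 8/6 rather than
; 7/7, with the six hi lanes masked-stored at 32(%rdi).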
define void @mstore_split14(<14 x float> %value, <14 x float>* %addr, <14 x i1> %mask) {
; CHECK-LABEL: mstore_split14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vmovd %eax, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $3, %eax, %xmm2, %xmm2
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; CHECK-NEXT:    vmaskmovps %ymm1, %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrw $1, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrw $3, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vpinsrw $4, %r9d, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; CHECK-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v14f32.p0v14f32(<14 x float> %value, <14 x float>* %addr, i32 4, <14 x i1> %mask)
  ret void
}
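; <17 x float> widens to <32 x float> (split 16/16), so the extended type
; splits 16/1; the lo half is split again 8/8, giving masked stores of 8, 8
; and 1 lanes at (%rdi), 32(%rdi) and 64(%rdi).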
define void @mstore_split17(<17 x float> %value, <17 x float>* %addr, <17 x i1> %mask) {
; CHECK-LABEL: mstore_split17:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vmovd %eax, %xmm3
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vmaskmovps %ymm2, %ymm3, 64(%rdi)
; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; CHECK-NEXT:    vmaskmovps %ymm1, %ymm2, 32(%rdi)
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $6, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; CHECK-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v17f32.p0v17f32(<17 x float> %value, <17 x float>* %addr, i32 4, <17 x i1> %mask)
  ret void
}
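; <23 x float> widens to <32 x float> (split 16/16), so the extended type
; splits 16/7; the lo half is split again 8/8, giving masked stores of 8, 8
; and 7 lanes at (%rdi), 32(%rdi) and 64(%rdi).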
define void @mstore_split23(<23 x float> %value, <23 x float>* %addr, <23 x i1> %mask) {
; CHECK-LABEL: mstore_split23:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
; CHECK-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; CHECK-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; CHECK-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT:    vpinsrb $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $4, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; CHECK-NEXT:    vpslld $31, %xmm4, %xmm4
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $14, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; CHECK-NEXT:    vmaskmovps %ymm2, %ymm3, 32(%rdi)
; CHECK-NEXT:    vmovd %eax, %xmm2
; CHECK-NEXT:    vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
; CHECK-NEXT:    vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; CHECK-NEXT:    vmaskmovps %ymm1, %ymm2, 64(%rdi)
; CHECK-NEXT:    vmovd %esi, %xmm1
; CHECK-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $6, %r8d, %xmm1, %xmm1
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; CHECK-NEXT:    vpslld $31, %xmm2, %xmm2
; CHECK-NEXT:    vpinsrb $8, %r9d, %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT:    vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
; CHECK-NEXT:    vmaskmovps %ymm0, %ymm1, (%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v23f32.p0v23f32(<23 x float> %value, <23 x float>* %addr, i32 4, <23 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v9f32.p0v9f32(<9 x float>, <9 x float>*, i32, <9 x i1>)
declare void @llvm.masked.store.v13f32.p0v13f32(<13 x float>, <13 x float>*, i32, <13 x i1>)
declare void @llvm.masked.store.v14f32.p0v14f32(<14 x float>, <14 x float>*, i32, <14 x i1>)
declare void @llvm.masked.store.v17f32.p0v17f32(<17 x float>, <17 x float>*, i32, <17 x i1>)
declare void @llvm.masked.store.v23f32.p0v23f32(<23 x float>, <23 x float>*, i32, <23 x i1>)