1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse  | FileCheck %s --check-prefix=SSE
3; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
4; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
5; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx  | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
6; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
7; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
8; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
9; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
10; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
11; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
12
13; https://llvm.org/bugs/show_bug.cgi?id=27100
14
define void @memset_16_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    retq
; A 16-byte fill of nonzero constant 42 via __memset_chk with an "infinite"
; object size (-1) is expected to be lowered to inline stores, not a libcall.
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 16, i64 -1)
  ret void
}
37
define void @memset_32_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
; 32 bytes: scalar stores without SSE-fast-unaligned, one ymm store with AVX.
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 32, i64 -1)
  ret void
}
64
define void @memset_64_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_64_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_64_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
; NOTE(review): removed a stray "AVX512NW-NEXT" line that followed the
; AVX512BW epilogue — "AVX512NW" is not a registered --check-prefix in any
; RUN line, so FileCheck silently ignored it; it duplicated the retq check
; above and would not be produced by update_llc_test_checks.py.
; 64 bytes: a single zmm store with AVX512 (-prefer-256-bit), two ymm stores
; with AVX, four xmm stores with fast-unaligned SSE2, scalar movq otherwise.
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 64, i64 -1)
  ret void
}
121
define void @memset_128_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 120(%rdi)
; SSE-NEXT:    movq %rax, 112(%rdi)
; SSE-NEXT:    movq %rax, 104(%rdi)
; SSE-NEXT:    movq %rax, 96(%rdi)
; SSE-NEXT:    movq %rax, 88(%rdi)
; SSE-NEXT:    movq %rax, 80(%rdi)
; SSE-NEXT:    movq %rax, 72(%rdi)
; SSE-NEXT:    movq %rax, 64(%rdi)
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_128_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_128_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
; 128 bytes is still fully unrolled into inline vector/scalar stores on
; every subtarget tested here — no libcall is emitted.
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 128, i64 -1)
  ret void
}
195
define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rax
; SSE-NEXT:    .cfi_def_cfa_offset 16
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    movl $42, %esi
; SSE-NEXT:    callq memset@PLT
; SSE-NEXT:    popq %rax
; SSE-NEXT:    .cfi_def_cfa_offset 8
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_256_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, 192(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, 128(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_256_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, 192(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, 128(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
; 256 bytes: the plain-SSE configuration falls back to a memset libcall,
; while vector-capable configurations still unroll into inline stores.
  %call = tail call i8* @__memset_chk(i8* %x, i32 42, i64 256, i64 -1)
  ret void
}
279
280declare i8* @__memset_chk(i8*, i32, i64, i64)
281
282; Repeat with a non-constant value for the stores.
283
define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: memset_16_nonconst_bytes:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX512-NEXT:    retq
; Non-constant fill byte: the value must be splatted (0x0101.. multiply on
; scalar SSE, shuffle/broadcast on vector targets) before being stored.
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 16, i1 false)
  ret void
}
328
define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: memset_32_nonconst_bytes:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
; 32-byte variable-value memset: two xmm stores pre-AVX2, one ymm broadcast
; plus a single store with AVX2/AVX512.
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 32, i1 false)
  ret void
}
379
define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_64_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_64_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
; 64-byte variable-value memset: AVX512BW can byte-broadcast from a GPR
; directly; AVX512F (no BW) splats via a 0x01010101 multiply + dword splat.
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 64, i1 false)
  ret void
}
447
define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 120(%rdi)
; SSE-NEXT:    movq %rcx, 112(%rdi)
; SSE-NEXT:    movq %rcx, 104(%rdi)
; SSE-NEXT:    movq %rcx, 96(%rdi)
; SSE-NEXT:    movq %rcx, 88(%rdi)
; SSE-NEXT:    movq %rcx, 80(%rdi)
; SSE-NEXT:    movq %rcx, 72(%rdi)
; SSE-NEXT:    movq %rcx, 64(%rdi)
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_128_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_128_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
; 128-byte variable-value memset still unrolls into inline stores on every
; configuration tested here; the splat sequence matches the 64-byte case.
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 128, i1 false)
  ret void
}
533
define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    jmp memset@PLT # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_256_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_256_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
; 256-byte variable-value memset: plain SSE tail-calls memset directly
; (args already in place, so a jmp suffices); vector targets stay inline.
  tail call void @llvm.memset.p0i8.i64(i8* %x, i8 %c, i64 256, i1 false)
  ret void
}
620
621declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) #1
622
623