; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=4 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX4

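; Vectors that fit within the LMUL limit are returned directly in v8 (or a v8
; register group).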
define fastcc <4 x i8> @ret_v4i8(<4 x i8>* %p) {
; CHECK-LABEL: ret_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i8>, <4 x i8>* %p
  ret <4 x i8> %v
}

define fastcc <4 x i32> @ret_v4i32(<4 x i32>* %p) {
; CHECK-LABEL: ret_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i32>, <4 x i32>* %p
  ret <4 x i32> %v
}

define fastcc <8 x i32> @ret_v8i32(<8 x i32>* %p) {
; CHECK-LABEL: ret_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i32>, <8 x i32>* %p
  ret <8 x i32> %v
}

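; A <16 x i64> return needs LMUL 8 at VLEN=128; when the LMUL cap is 4 it is
; split across the v8 and v12 register groups.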
define fastcc <16 x i64> @ret_v16i64(<16 x i64>* %p) {
; LMULMAX8-LABEL: ret_v16i64:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
; LMULMAX8-NEXT:    vle64.v v8, (a0)
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v16i64:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
; LMULMAX4-NEXT:    vle64.v v8, (a0)
; LMULMAX4-NEXT:    addi a0, a0, 64
; LMULMAX4-NEXT:    vle64.v v12, (a0)
; LMULMAX4-NEXT:    ret
  %v = load <16 x i64>, <16 x i64>* %p
  ret <16 x i64> %v
}

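; Mask vectors are returned in v0.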
define fastcc <8 x i1> @ret_mask_v8i1(<8 x i1>* %p) {
; CHECK-LABEL: ret_mask_v8i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT:    vle1.v v0, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, <8 x i1>* %p
  ret <8 x i1> %v
}

define fastcc <32 x i1> @ret_mask_v32i1(<32 x i1>* %p) {
; CHECK-LABEL: ret_mask_v32i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a1, zero, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
; CHECK-NEXT:    vle1.v v0, (a0)
; CHECK-NEXT:    ret
  %v = load <32 x i1>, <32 x i1>* %p
  ret <32 x i1> %v
}

; Return the vector via registers v8-v23
define fastcc <64 x i32> @ret_split_v64i32(<64 x i32>* %x) {
; LMULMAX8-LABEL: ret_split_v64i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a1, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v8, (a0)
; LMULMAX8-NEXT:    addi a0, a0, 128
; LMULMAX8-NEXT:    vle32.v v16, (a0)
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_split_v64i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v8, (a0)
; LMULMAX4-NEXT:    addi a1, a0, 64
; LMULMAX4-NEXT:    vle32.v v12, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 128
; LMULMAX4-NEXT:    vle32.v v16, (a1)
; LMULMAX4-NEXT:    addi a0, a0, 192
; LMULMAX4-NEXT:    vle32.v v20, (a0)
; LMULMAX4-NEXT:    ret
  %v = load <64 x i32>, <64 x i32>* %x
  ret <64 x i32> %v
}

; Return the vector fully via the stack
define fastcc <128 x i32> @ret_split_v128i32(<128 x i32>* %x) {
; LMULMAX8-LABEL: ret_split_v128i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a2, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v8, (a1)
; LMULMAX8-NEXT:    addi a2, a1, 128
; LMULMAX8-NEXT:    vle32.v v16, (a2)
; LMULMAX8-NEXT:    addi a2, a1, 384
; LMULMAX8-NEXT:    vle32.v v24, (a2)
; LMULMAX8-NEXT:    addi a1, a1, 256
; LMULMAX8-NEXT:    vle32.v v0, (a1)
; LMULMAX8-NEXT:    addi a1, a0, 384
; LMULMAX8-NEXT:    vse32.v v24, (a1)
; LMULMAX8-NEXT:    addi a1, a0, 256
; LMULMAX8-NEXT:    vse32.v v0, (a1)
; LMULMAX8-NEXT:    addi a1, a0, 128
; LMULMAX8-NEXT:    vse32.v v16, (a1)
; LMULMAX8-NEXT:    vse32.v v8, (a0)
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_split_v128i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v28, (a1)
; LMULMAX4-NEXT:    addi a2, a1, 64
; LMULMAX4-NEXT:    vle32.v v8, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 128
; LMULMAX4-NEXT:    vle32.v v12, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 192
; LMULMAX4-NEXT:    vle32.v v16, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 256
; LMULMAX4-NEXT:    vle32.v v20, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 320
; LMULMAX4-NEXT:    vle32.v v24, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 448
; LMULMAX4-NEXT:    vle32.v v0, (a2)
; LMULMAX4-NEXT:    addi a1, a1, 384
; LMULMAX4-NEXT:    vle32.v v4, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 448
; LMULMAX4-NEXT:    vse32.v v0, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 384
; LMULMAX4-NEXT:    vse32.v v4, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 320
; LMULMAX4-NEXT:    vse32.v v24, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 256
; LMULMAX4-NEXT:    vse32.v v20, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 192
; LMULMAX4-NEXT:    vse32.v v16, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 128
; LMULMAX4-NEXT:    vse32.v v12, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 64
; LMULMAX4-NEXT:    vse32.v v8, (a1)
; LMULMAX4-NEXT:    vse32.v v28, (a0)
; LMULMAX4-NEXT:    ret
  %v = load <128 x i32>, <128 x i32>* %x
  ret <128 x i32> %v
}

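; Vector arguments are passed in v8 and the registers above it; the first mask
; argument is passed in v0.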
define fastcc <4 x i8> @ret_v8i8_param_v4i8(<4 x i8> %v) {
; CHECK-LABEL: ret_v8i8_param_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    vadd.vi v8, v8, 2
; CHECK-NEXT:    ret
  %r = add <4 x i8> %v, <i8 2, i8 2, i8 2, i8 2>
  ret <4 x i8> %r
}

define fastcc <4 x i8> @ret_v4i8_param_v4i8_v4i8(<4 x i8> %v, <4 x i8> %w) {
; CHECK-LABEL: ret_v4i8_param_v4i8_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    ret
  %r = add <4 x i8> %v, %w
  ret <4 x i8> %r
}

define fastcc <4 x i64> @ret_v4i64_param_v4i64_v4i64(<4 x i64> %v, <4 x i64> %w) {
; CHECK-LABEL: ret_v4i64_param_v4i64_v4i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    ret
  %r = add <4 x i64> %v, %w
  ret <4 x i64> %r
}

define fastcc <8 x i1> @ret_v8i1_param_v8i1_v8i1(<8 x i1> %v, <8 x i1> %w) {
; CHECK-LABEL: ret_v8i1_param_v8i1_v8i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT:    vmxor.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = xor <8 x i1> %v, %w
  ret <8 x i1> %r
}

define fastcc <32 x i1> @ret_v32i1_param_v32i1_v32i1(<32 x i1> %v, <32 x i1> %w) {
; CHECK-LABEL: ret_v32i1_param_v32i1_v32i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, zero, 32
; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
; CHECK-NEXT:    vmand.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = and <32 x i1> %v, %w
  ret <32 x i1> %r
}

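; With three <32 x i32> arguments the vector argument registers are exhausted,
; so %z is passed indirectly by address in GPRs and %w takes the next free GPR.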
define fastcc <32 x i32> @ret_v32i32_param_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
; LMULMAX8-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a2, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v24, (a0)
; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
; LMULMAX8-NEXT:    vadd.vv v8, v8, v24
; LMULMAX8-NEXT:    vadd.vx v8, v8, a1
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    addi a1, a0, 64
; LMULMAX4-NEXT:    vle32.v v28, (a1)
; LMULMAX4-NEXT:    vle32.v v24, (a0)
; LMULMAX4-NEXT:    vadd.vv v8, v8, v16
; LMULMAX4-NEXT:    vadd.vv v12, v12, v20
; LMULMAX4-NEXT:    vadd.vv v28, v12, v28
; LMULMAX4-NEXT:    vadd.vv v8, v8, v24
; LMULMAX4-NEXT:    vadd.vx v8, v8, a2
; LMULMAX4-NEXT:    vadd.vx v12, v28, a2
; LMULMAX4-NEXT:    ret
  %r = add <32 x i32> %x, %y
  %s = add <32 x i32> %r, %z
  %head = insertelement <32 x i32> undef, i32 %w, i32 0
  %splat = shufflevector <32 x i32> %head, <32 x i32> undef, <32 x i32> zeroinitializer
  %t = add <32 x i32> %s, %splat
  ret <32 x i32> %t
}

declare <32 x i32> @ext2(<32 x i32>, <32 x i32>, i32, i32)
declare <32 x i32> @ext3(<32 x i32>, <32 x i32>, <32 x i32>, i32, i32)

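; The call below reverses %x and %y, so the operands are swapped between the v8
; and v16 register groups using whole-register moves.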
define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, i32 %w) {
; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -16
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX8-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    vmv8r.v v24, v8
; LMULMAX8-NEXT:    addi a1, zero, 2
; LMULMAX8-NEXT:    vmv8r.v v8, v16
; LMULMAX8-NEXT:    vmv8r.v v16, v24
; LMULMAX8-NEXT:    call ext2@plt
; LMULMAX8-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 16
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -16
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX4-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    vmv4r.v v28, v12
; LMULMAX4-NEXT:    vmv4r.v v24, v8
; LMULMAX4-NEXT:    addi a1, zero, 2
; LMULMAX4-NEXT:    vmv4r.v v8, v16
; LMULMAX4-NEXT:    vmv4r.v v12, v20
; LMULMAX4-NEXT:    vmv4r.v v16, v24
; LMULMAX4-NEXT:    vmv4r.v v20, v28
; LMULMAX4-NEXT:    call ext2@plt
; LMULMAX4-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 16
; LMULMAX4-NEXT:    ret
  %t = call fastcc <32 x i32> @ext2(<32 x i32> %y, <32 x i32> %x, i32 %w, i32 2)
  ret <32 x i32> %t
}

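; The third vector argument to ext3 does not fit in the remaining registers, so
; it is spilled to an aligned stack slot and passed to the callee by address.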
define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -384
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX8-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    .cfi_offset s0, -16
; LMULMAX8-NEXT:    addi s0, sp, 384
; LMULMAX8-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX8-NEXT:    andi sp, sp, -128
; LMULMAX8-NEXT:    addi a2, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v24, (a0)
; LMULMAX8-NEXT:    addi a0, sp, 128
; LMULMAX8-NEXT:    addi a2, zero, 42
; LMULMAX8-NEXT:    addi a3, sp, 128
; LMULMAX8-NEXT:    vse32.v v8, (a3)
; LMULMAX8-NEXT:    vmv8r.v v8, v24
; LMULMAX8-NEXT:    call ext3@plt
; LMULMAX8-NEXT:    addi sp, s0, -384
; LMULMAX8-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 384
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -384
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX4-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    .cfi_offset s0, -16
; LMULMAX4-NEXT:    addi s0, sp, 384
; LMULMAX4-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX4-NEXT:    andi sp, sp, -128
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v28, (a0)
; LMULMAX4-NEXT:    addi a0, a0, 64
; LMULMAX4-NEXT:    vle32.v v24, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 192
; LMULMAX4-NEXT:    vse32.v v12, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 128
; LMULMAX4-NEXT:    addi a3, zero, 42
; LMULMAX4-NEXT:    addi a1, sp, 128
; LMULMAX4-NEXT:    vse32.v v8, (a1)
; LMULMAX4-NEXT:    vmv4r.v v8, v28
; LMULMAX4-NEXT:    vmv4r.v v12, v24
; LMULMAX4-NEXT:    call ext3@plt
; LMULMAX4-NEXT:    addi sp, s0, -384
; LMULMAX4-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 384
; LMULMAX4-NEXT:    ret
  %t = call fastcc <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42)
  ret <32 x i32> %t
}

; A test case where the normal calling convention would pass the vector directly
; on the stack, but fastcc can pass it indirectly using the extra GPR argument
; registers it allows.
define fastcc <32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) {
; LMULMAX8-LABEL: vector_arg_indirect_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v16, (t2)
; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: vector_arg_indirect_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi a0, t2, 64
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v28, (t2)
; LMULMAX4-NEXT:    vle32.v v16, (a0)
; LMULMAX4-NEXT:    vadd.vv v8, v8, v28
; LMULMAX4-NEXT:    vadd.vv v12, v12, v16
; LMULMAX4-NEXT:    ret
  %s = add <32 x i32> %x, %z
  ret <32 x i32> %s
}

; Calling the function above. Ensure we pass the arguments correctly.
define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
; LMULMAX8-LABEL: pass_vector_arg_indirect_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -384
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX8-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    .cfi_offset s0, -16
; LMULMAX8-NEXT:    addi s0, sp, 384
; LMULMAX8-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX8-NEXT:    andi sp, sp, -128
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    vmv.v.i v8, 0
; LMULMAX8-NEXT:    addi a1, zero, 1
; LMULMAX8-NEXT:    addi a2, zero, 2
; LMULMAX8-NEXT:    addi a3, zero, 3
; LMULMAX8-NEXT:    addi a4, zero, 4
; LMULMAX8-NEXT:    addi a5, zero, 5
; LMULMAX8-NEXT:    addi a6, zero, 6
; LMULMAX8-NEXT:    addi a7, zero, 7
; LMULMAX8-NEXT:    addi t2, sp, 128
; LMULMAX8-NEXT:    addi t3, zero, 8
; LMULMAX8-NEXT:    addi a0, sp, 128
; LMULMAX8-NEXT:    vse32.v v8, (a0)
; LMULMAX8-NEXT:    mv a0, zero
; LMULMAX8-NEXT:    vmv8r.v v16, v8
; LMULMAX8-NEXT:    call vector_arg_indirect_stack@plt
; LMULMAX8-NEXT:    addi sp, s0, -384
; LMULMAX8-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 384
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: pass_vector_arg_indirect_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -384
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX4-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    .cfi_offset s0, -16
; LMULMAX4-NEXT:    addi s0, sp, 384
; LMULMAX4-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX4-NEXT:    andi sp, sp, -128
; LMULMAX4-NEXT:    addi a0, sp, 192
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vmv.v.i v8, 0
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    addi a1, zero, 1
; LMULMAX4-NEXT:    addi a2, zero, 2
; LMULMAX4-NEXT:    addi a3, zero, 3
; LMULMAX4-NEXT:    addi a4, zero, 4
; LMULMAX4-NEXT:    addi a5, zero, 5
; LMULMAX4-NEXT:    addi a6, zero, 6
; LMULMAX4-NEXT:    addi a7, zero, 7
; LMULMAX4-NEXT:    addi t2, sp, 128
; LMULMAX4-NEXT:    addi t4, zero, 8
; LMULMAX4-NEXT:    addi a0, sp, 128
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    mv a0, zero
; LMULMAX4-NEXT:    vmv4r.v v12, v8
; LMULMAX4-NEXT:    vmv4r.v v16, v8
; LMULMAX4-NEXT:    vmv4r.v v20, v8
; LMULMAX4-NEXT:    call vector_arg_indirect_stack@plt
; LMULMAX4-NEXT:    addi sp, s0, -384
; LMULMAX4-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 384
; LMULMAX4-NEXT:    ret
  %s = call fastcc <32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 8)
  ret <32 x i32> %s
}

; A pathological test case where even with fastcc we must use the stack for arguments %13 and %z
define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %last) {
; LMULMAX8-LABEL: vector_arg_direct_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -16
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    addi a0, sp, 24
; LMULMAX8-NEXT:    vle32.v v24, (a0)
; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
; LMULMAX8-NEXT:    vadd.vv v8, v8, v24
; LMULMAX8-NEXT:    addi sp, sp, 16
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: vector_arg_direct_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -16
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    addi a0, sp, 24
; LMULMAX4-NEXT:    vle32.v v28, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 88
; LMULMAX4-NEXT:    vle32.v v24, (a0)
; LMULMAX4-NEXT:    vadd.vv v12, v12, v20
; LMULMAX4-NEXT:    vadd.vv v8, v8, v16
; LMULMAX4-NEXT:    vadd.vv v8, v8, v28
; LMULMAX4-NEXT:    vadd.vv v12, v12, v24
; LMULMAX4-NEXT:    addi sp, sp, 16
; LMULMAX4-NEXT:    ret
  %s = add <32 x i32> %x, %y
  %t = add <32 x i32> %s, %z
  ret <32 x i32> %t
}

; Calling the function above. Ensure we pass the arguments correctly.
define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
; LMULMAX8-LABEL: pass_vector_arg_direct_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -160
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 160
; LMULMAX8-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    vmv.v.i v8, 0
; LMULMAX8-NEXT:    addi a0, sp, 8
; LMULMAX8-NEXT:    vse32.v v8, (a0)
; LMULMAX8-NEXT:    addi a0, zero, 1
; LMULMAX8-NEXT:    sd a0, 136(sp)
; LMULMAX8-NEXT:    addi a0, zero, 13
; LMULMAX8-NEXT:    addi a1, zero, 1
; LMULMAX8-NEXT:    addi a2, zero, 2
; LMULMAX8-NEXT:    addi a3, zero, 3
; LMULMAX8-NEXT:    addi a4, zero, 4
; LMULMAX8-NEXT:    addi a5, zero, 5
; LMULMAX8-NEXT:    addi a6, zero, 6
; LMULMAX8-NEXT:    addi a7, zero, 7
; LMULMAX8-NEXT:    addi t2, zero, 8
; LMULMAX8-NEXT:    addi t3, zero, 9
; LMULMAX8-NEXT:    addi t4, zero, 10
; LMULMAX8-NEXT:    addi t5, zero, 11
; LMULMAX8-NEXT:    addi t6, zero, 12
; LMULMAX8-NEXT:    sd a0, 0(sp)
; LMULMAX8-NEXT:    mv a0, zero
; LMULMAX8-NEXT:    vmv8r.v v16, v8
; LMULMAX8-NEXT:    call vector_arg_direct_stack@plt
; LMULMAX8-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 160
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: pass_vector_arg_direct_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -160
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 160
; LMULMAX4-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    addi a0, zero, 1
; LMULMAX4-NEXT:    sd a0, 136(sp)
; LMULMAX4-NEXT:    addi a0, zero, 13
; LMULMAX4-NEXT:    sd a0, 0(sp)
; LMULMAX4-NEXT:    addi a0, sp, 72
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vmv.v.i v8, 0
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 8
; LMULMAX4-NEXT:    addi a1, zero, 1
; LMULMAX4-NEXT:    addi a2, zero, 2
; LMULMAX4-NEXT:    addi a3, zero, 3
; LMULMAX4-NEXT:    addi a4, zero, 4
; LMULMAX4-NEXT:    addi a5, zero, 5
; LMULMAX4-NEXT:    addi a6, zero, 6
; LMULMAX4-NEXT:    addi a7, zero, 7
; LMULMAX4-NEXT:    addi t2, zero, 8
; LMULMAX4-NEXT:    addi t3, zero, 9
; LMULMAX4-NEXT:    addi t4, zero, 10
; LMULMAX4-NEXT:    addi t5, zero, 11
; LMULMAX4-NEXT:    addi t6, zero, 12
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    mv a0, zero
; LMULMAX4-NEXT:    vmv4r.v v12, v8
; LMULMAX4-NEXT:    vmv4r.v v16, v8
; LMULMAX4-NEXT:    vmv4r.v v20, v8
; LMULMAX4-NEXT:    call vector_arg_direct_stack@plt
; LMULMAX4-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 160
; LMULMAX4-NEXT:    ret
  %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1)
  ret <32 x i32> %s
}

; A pathological test case where even with fastcc we must use the stack for
; mask argument %m2. %m1 is passed via v0.
define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) {
; CHECK-LABEL: vector_mask_arg_direct_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    addi a0, sp, 152
; CHECK-NEXT:    vle1.v v25, (a0)
; CHECK-NEXT:    vmxor.mm v0, v0, v25
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    ret
  %r = xor <4 x i1> %m1, %m2
  ret <4 x i1> %r
}