; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8
; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=4 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX4

define fastcc <4 x i8> @ret_v4i8(<4 x i8>* %p) {
; CHECK-LABEL: ret_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i8>, <4 x i8>* %p
  ret <4 x i8> %v
}

define fastcc <4 x i32> @ret_v4i32(<4 x i32>* %p) {
; CHECK-LABEL: ret_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i32>, <4 x i32>* %p
  ret <4 x i32> %v
}

define fastcc <8 x i32> @ret_v8i32(<8 x i32>* %p) {
; CHECK-LABEL: ret_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, mu
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i32>, <8 x i32>* %p
  ret <8 x i32> %v
}

define fastcc <16 x i64> @ret_v16i64(<16 x i64>* %p) {
; LMULMAX8-LABEL: ret_v16i64:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
; LMULMAX8-NEXT:    vle64.v v8, (a0)
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v16i64:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
; LMULMAX4-NEXT:    vle64.v v8, (a0)
; LMULMAX4-NEXT:    addi a0, a0, 64
; LMULMAX4-NEXT:    vle64.v v12, (a0)
; LMULMAX4-NEXT:    ret
  %v = load <16 x i64>, <16 x i64>* %p
  ret <16 x i64> %v
}

define fastcc <8 x i1> @ret_mask_v8i1(<8 x i1>* %p) {
; CHECK-LABEL: ret_mask_v8i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT:    vle1.v v0, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, <8 x i1>* %p
  ret <8 x i1> %v
}

define fastcc <32 x i1> @ret_mask_v32i1(<32 x i1>* %p) {
; CHECK-LABEL: ret_mask_v32i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a1, zero, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu
; CHECK-NEXT:    vle1.v v0, (a0)
; CHECK-NEXT:    ret
  %v = load <32 x i1>, <32 x i1>* %p
  ret <32 x i1> %v
}

; Return the vector via registers v8-v23
define fastcc <64 x i32> @ret_split_v64i32(<64 x i32>* %x) {
; LMULMAX8-LABEL: ret_split_v64i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a1, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v8, (a0)
; LMULMAX8-NEXT:    addi a0, a0, 128
; LMULMAX8-NEXT:    vle32.v v16, (a0)
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_split_v64i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v8, (a0)
; LMULMAX4-NEXT:    addi a1, a0, 64
; LMULMAX4-NEXT:    vle32.v v12, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 128
; LMULMAX4-NEXT:    vle32.v v16, (a1)
; LMULMAX4-NEXT:    addi a0, a0, 192
; LMULMAX4-NEXT:    vle32.v v20, (a0)
; LMULMAX4-NEXT:    ret
  %v = load <64 x i32>, <64 x i32>* %x
  ret <64 x i32> %v
}

; Return the vector fully via the stack
define fastcc <128 x i32> @ret_split_v128i32(<128 x i32>* %x) {
; LMULMAX8-LABEL: ret_split_v128i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a2, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v8, (a1)
; LMULMAX8-NEXT:    addi a2, a1, 128
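; (A <128 x i32> result does not fit in v8-v23, so it is returned indirectly:
; a0 carries the hidden return-slot pointer and a1 points at %x, giving the
; chunked load/store copy seen here.)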
; LMULMAX8-NEXT:    vle32.v v16, (a2)
; LMULMAX8-NEXT:    addi a2, a1, 384
; LMULMAX8-NEXT:    vle32.v v24, (a2)
; LMULMAX8-NEXT:    addi a1, a1, 256
; LMULMAX8-NEXT:    vle32.v v0, (a1)
; LMULMAX8-NEXT:    addi a1, a0, 384
; LMULMAX8-NEXT:    vse32.v v24, (a1)
; LMULMAX8-NEXT:    addi a1, a0, 256
; LMULMAX8-NEXT:    vse32.v v0, (a1)
; LMULMAX8-NEXT:    addi a1, a0, 128
; LMULMAX8-NEXT:    vse32.v v16, (a1)
; LMULMAX8-NEXT:    vse32.v v8, (a0)
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_split_v128i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v28, (a1)
; LMULMAX4-NEXT:    addi a2, a1, 64
; LMULMAX4-NEXT:    vle32.v v8, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 128
; LMULMAX4-NEXT:    vle32.v v12, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 192
; LMULMAX4-NEXT:    vle32.v v16, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 256
; LMULMAX4-NEXT:    vle32.v v20, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 320
; LMULMAX4-NEXT:    vle32.v v24, (a2)
; LMULMAX4-NEXT:    addi a2, a1, 448
; LMULMAX4-NEXT:    vle32.v v0, (a2)
; LMULMAX4-NEXT:    addi a1, a1, 384
; LMULMAX4-NEXT:    vle32.v v4, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 448
; LMULMAX4-NEXT:    vse32.v v0, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 384
; LMULMAX4-NEXT:    vse32.v v4, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 320
; LMULMAX4-NEXT:    vse32.v v24, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 256
; LMULMAX4-NEXT:    vse32.v v20, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 192
; LMULMAX4-NEXT:    vse32.v v16, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 128
; LMULMAX4-NEXT:    vse32.v v12, (a1)
; LMULMAX4-NEXT:    addi a1, a0, 64
; LMULMAX4-NEXT:    vse32.v v8, (a1)
; LMULMAX4-NEXT:    vse32.v v28, (a0)
; LMULMAX4-NEXT:    ret
  %v = load <128 x i32>, <128 x i32>* %x
  ret <128 x i32> %v
}

define fastcc <4 x i8> @ret_v4i8_param_v4i8(<4 x i8> %v) {
; CHECK-LABEL: ret_v4i8_param_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    vadd.vi v8, v8, 2
; CHECK-NEXT:    ret
  %r = add <4 x i8> %v, <i8 2, i8 2, i8 2, i8 2>
  ret <4 x i8> %r
}

define fastcc <4 x i8> @ret_v4i8_param_v4i8_v4i8(<4 x i8> %v, <4 x i8> %w) {
; CHECK-LABEL: ret_v4i8_param_v4i8_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    ret
  %r = add <4 x i8> %v, %w
  ret <4 x i8> %r
}

define fastcc <4 x i64> @ret_v4i64_param_v4i64_v4i64(<4 x i64> %v, <4 x i64> %w) {
; CHECK-LABEL: ret_v4i64_param_v4i64_v4i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, mu
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    ret
  %r = add <4 x i64> %v, %w
  ret <4 x i64> %r
}

define fastcc <8 x i1> @ret_v8i1_param_v8i1_v8i1(<8 x i1> %v, <8 x i1> %w) {
; CHECK-LABEL: ret_v8i1_param_v8i1_v8i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT:    vmxor.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = xor <8 x i1> %v, %w
  ret <8 x i1> %r
}

define fastcc <32 x i1> @ret_v32i1_param_v32i1_v32i1(<32 x i1> %v, <32 x i1> %w) {
; CHECK-LABEL: ret_v32i1_param_v32i1_v32i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, zero, 32
; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, mu
; CHECK-NEXT:    vmand.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = and <32 x i1> %v, %w
  ret <32 x i1> %r
}

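; Three <32 x i32> arguments: %x and %y occupy v8-v23, so %z is passed
; indirectly and loaded through the pointer in a0; the scalar %w follows in
; the next free GPR (a1 for LMULMAX8, a2 for LMULMAX4).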
define fastcc <32 x i32> @ret_v32i32_param_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
; LMULMAX8-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a2, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v24, (a0)
; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
; LMULMAX8-NEXT:    vadd.vv v8, v8, v24
; LMULMAX8-NEXT:    vadd.vx v8, v8, a1
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    addi a1, a0, 64
; LMULMAX4-NEXT:    vle32.v v28, (a1)
; LMULMAX4-NEXT:    vle32.v v24, (a0)
; LMULMAX4-NEXT:    vadd.vv v8, v8, v16
; LMULMAX4-NEXT:    vadd.vv v12, v12, v20
; LMULMAX4-NEXT:    vadd.vv v28, v12, v28
; LMULMAX4-NEXT:    vadd.vv v8, v8, v24
; LMULMAX4-NEXT:    vadd.vx v8, v8, a2
; LMULMAX4-NEXT:    vadd.vx v12, v28, a2
; LMULMAX4-NEXT:    ret
  %r = add <32 x i32> %x, %y
  %s = add <32 x i32> %r, %z
  %head = insertelement <32 x i32> undef, i32 %w, i32 0
  %splat = shufflevector <32 x i32> %head, <32 x i32> undef, <32 x i32> zeroinitializer
  %t = add <32 x i32> %s, %splat
  ret <32 x i32> %t
}

declare <32 x i32> @ext2(<32 x i32>, <32 x i32>, i32, i32)
declare <32 x i32> @ext3(<32 x i32>, <32 x i32>, <32 x i32>, i32, i32)

define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, i32 %w) {
; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -16
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX8-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    vmv8r.v v24, v8
; LMULMAX8-NEXT:    addi a1, zero, 2
; LMULMAX8-NEXT:    vmv8r.v v8, v16
; LMULMAX8-NEXT:    vmv8r.v v16, v24
; LMULMAX8-NEXT:    call ext2@plt
; LMULMAX8-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 16
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -16
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX4-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    vmv4r.v v28, v12
; LMULMAX4-NEXT:    vmv4r.v v24, v8
; LMULMAX4-NEXT:    addi a1, zero, 2
; LMULMAX4-NEXT:    vmv4r.v v8, v16
; LMULMAX4-NEXT:    vmv4r.v v12, v20
; LMULMAX4-NEXT:    vmv4r.v v16, v24
; LMULMAX4-NEXT:    vmv4r.v v20, v28
; LMULMAX4-NEXT:    call ext2@plt
; LMULMAX4-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 16
; LMULMAX4-NEXT:    ret
  %t = call fastcc <32 x i32> @ext2(<32 x i32> %y, <32 x i32> %x, i32 %w, i32 2)
  ret <32 x i32> %t
}

define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -384
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX8-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    .cfi_offset s0, -16
; LMULMAX8-NEXT:    addi s0, sp, 384
; LMULMAX8-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX8-NEXT:    andi sp, sp, -128
; LMULMAX8-NEXT:    addi a2, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v24, (a0)
; LMULMAX8-NEXT:    addi a0, sp, 128
; LMULMAX8-NEXT:    addi a2, zero, 42
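; (%w already sits in a1 from this function's own argument list, so it is
; passed through untouched; a0 points at the stack copy of the third vector
; argument and a2 carries the constant 42.)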
; LMULMAX8-NEXT:    addi a3, sp, 128
; LMULMAX8-NEXT:    vse32.v v8, (a3)
; LMULMAX8-NEXT:    vmv8r.v v8, v24
; LMULMAX8-NEXT:    call ext3@plt
; LMULMAX8-NEXT:    addi sp, s0, -384
; LMULMAX8-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 384
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -384
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX4-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    .cfi_offset s0, -16
; LMULMAX4-NEXT:    addi s0, sp, 384
; LMULMAX4-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX4-NEXT:    andi sp, sp, -128
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v28, (a0)
; LMULMAX4-NEXT:    addi a0, a0, 64
; LMULMAX4-NEXT:    vle32.v v24, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 192
; LMULMAX4-NEXT:    vse32.v v12, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 128
; LMULMAX4-NEXT:    addi a3, zero, 42
; LMULMAX4-NEXT:    addi a1, sp, 128
; LMULMAX4-NEXT:    vse32.v v8, (a1)
; LMULMAX4-NEXT:    vmv4r.v v8, v28
; LMULMAX4-NEXT:    vmv4r.v v12, v24
; LMULMAX4-NEXT:    call ext3@plt
; LMULMAX4-NEXT:    addi sp, s0, -384
; LMULMAX4-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 384
; LMULMAX4-NEXT:    ret
  %t = call fastcc <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42)
  ret <32 x i32> %t
}

; A test case where the normal calling convention would pass the vectors
; directly on the stack, but fastcc can pass them indirectly using the extra
; GPR argument registers it allows.
define fastcc <32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) {
; LMULMAX8-LABEL: vector_arg_indirect_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    vle32.v v16, (t2)
; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: vector_arg_indirect_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi a0, t2, 64
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vle32.v v28, (t2)
; LMULMAX4-NEXT:    vle32.v v16, (a0)
; LMULMAX4-NEXT:    vadd.vv v8, v8, v28
; LMULMAX4-NEXT:    vadd.vv v12, v12, v16
; LMULMAX4-NEXT:    ret
  %s = add <32 x i32> %x, %z
  ret <32 x i32> %s
}

; Calling the function above. Ensure we pass the arguments correctly.
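; The zeroed %z is spilled to the 128-byte-aligned slot at sp+128 and its
; address is passed in t2, the first of the extra fastcc GPRs beyond a0-a7.
; The trailing i32 8 then goes in t3 under LMULMAX8; under LMULMAX4 it lands
; in t4, t3 apparently staying reserved for the second half of the split %z.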
define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
; LMULMAX8-LABEL: pass_vector_arg_indirect_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -384
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX8-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    .cfi_offset s0, -16
; LMULMAX8-NEXT:    addi s0, sp, 384
; LMULMAX8-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX8-NEXT:    andi sp, sp, -128
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    vmv.v.i v8, 0
; LMULMAX8-NEXT:    addi a1, zero, 1
; LMULMAX8-NEXT:    addi a2, zero, 2
; LMULMAX8-NEXT:    addi a3, zero, 3
; LMULMAX8-NEXT:    addi a4, zero, 4
; LMULMAX8-NEXT:    addi a5, zero, 5
; LMULMAX8-NEXT:    addi a6, zero, 6
; LMULMAX8-NEXT:    addi a7, zero, 7
; LMULMAX8-NEXT:    addi t2, sp, 128
; LMULMAX8-NEXT:    addi t3, zero, 8
; LMULMAX8-NEXT:    addi a0, sp, 128
; LMULMAX8-NEXT:    vse32.v v8, (a0)
; LMULMAX8-NEXT:    mv a0, zero
; LMULMAX8-NEXT:    vmv8r.v v16, v8
; LMULMAX8-NEXT:    call vector_arg_indirect_stack@plt
; LMULMAX8-NEXT:    addi sp, s0, -384
; LMULMAX8-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 384
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: pass_vector_arg_indirect_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -384
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 384
; LMULMAX4-NEXT:    sd ra, 376(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    sd s0, 368(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    .cfi_offset s0, -16
; LMULMAX4-NEXT:    addi s0, sp, 384
; LMULMAX4-NEXT:    .cfi_def_cfa s0, 0
; LMULMAX4-NEXT:    andi sp, sp, -128
; LMULMAX4-NEXT:    addi a0, sp, 192
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vmv.v.i v8, 0
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    addi a1, zero, 1
; LMULMAX4-NEXT:    addi a2, zero, 2
; LMULMAX4-NEXT:    addi a3, zero, 3
; LMULMAX4-NEXT:    addi a4, zero, 4
; LMULMAX4-NEXT:    addi a5, zero, 5
; LMULMAX4-NEXT:    addi a6, zero, 6
; LMULMAX4-NEXT:    addi a7, zero, 7
; LMULMAX4-NEXT:    addi t2, sp, 128
; LMULMAX4-NEXT:    addi t4, zero, 8
; LMULMAX4-NEXT:    addi a0, sp, 128
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    mv a0, zero
; LMULMAX4-NEXT:    vmv4r.v v12, v8
; LMULMAX4-NEXT:    vmv4r.v v16, v8
; LMULMAX4-NEXT:    vmv4r.v v20, v8
; LMULMAX4-NEXT:    call vector_arg_indirect_stack@plt
; LMULMAX4-NEXT:    addi sp, s0, -384
; LMULMAX4-NEXT:    ld s0, 368(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    ld ra, 376(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 384
; LMULMAX4-NEXT:    ret
  %s = call fastcc <32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 8)
  ret <32 x i32> %s
}

; A pathological test case where even with fastcc we must use the stack for
; arguments %13 and %z
define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %last) {
; LMULMAX8-LABEL: vector_arg_direct_stack:
; LMULMAX8:       # %bb.0:
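; (Stack arguments on entry: %13 at 0(sp), %z at 8(sp), %last at 136(sp);
; after the 16-byte prologue below, %z is read from 24(sp).)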
; LMULMAX8-NEXT:    addi sp, sp, -16
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    addi a0, sp, 24
; LMULMAX8-NEXT:    vle32.v v24, (a0)
; LMULMAX8-NEXT:    vadd.vv v8, v8, v16
; LMULMAX8-NEXT:    vadd.vv v8, v8, v24
; LMULMAX8-NEXT:    addi sp, sp, 16
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: vector_arg_direct_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -16
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 16
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    addi a0, sp, 24
; LMULMAX4-NEXT:    vle32.v v28, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 88
; LMULMAX4-NEXT:    vle32.v v24, (a0)
; LMULMAX4-NEXT:    vadd.vv v12, v12, v20
; LMULMAX4-NEXT:    vadd.vv v8, v8, v16
; LMULMAX4-NEXT:    vadd.vv v8, v8, v28
; LMULMAX4-NEXT:    vadd.vv v12, v12, v24
; LMULMAX4-NEXT:    addi sp, sp, 16
; LMULMAX4-NEXT:    ret
  %s = add <32 x i32> %x, %y
  %t = add <32 x i32> %s, %z
  ret <32 x i32> %t
}

; Calling the function above. Ensure we pass the arguments correctly.
define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
; LMULMAX8-LABEL: pass_vector_arg_direct_stack:
; LMULMAX8:       # %bb.0:
; LMULMAX8-NEXT:    addi sp, sp, -160
; LMULMAX8-NEXT:    .cfi_def_cfa_offset 160
; LMULMAX8-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
; LMULMAX8-NEXT:    .cfi_offset ra, -8
; LMULMAX8-NEXT:    addi a0, zero, 32
; LMULMAX8-NEXT:    vsetvli zero, a0, e32, m8, ta, mu
; LMULMAX8-NEXT:    vmv.v.i v8, 0
; LMULMAX8-NEXT:    addi a0, sp, 8
; LMULMAX8-NEXT:    vse32.v v8, (a0)
; LMULMAX8-NEXT:    addi a0, zero, 1
; LMULMAX8-NEXT:    sd a0, 136(sp)
; LMULMAX8-NEXT:    addi a0, zero, 13
; LMULMAX8-NEXT:    addi a1, zero, 1
; LMULMAX8-NEXT:    addi a2, zero, 2
; LMULMAX8-NEXT:    addi a3, zero, 3
; LMULMAX8-NEXT:    addi a4, zero, 4
; LMULMAX8-NEXT:    addi a5, zero, 5
; LMULMAX8-NEXT:    addi a6, zero, 6
; LMULMAX8-NEXT:    addi a7, zero, 7
; LMULMAX8-NEXT:    addi t2, zero, 8
; LMULMAX8-NEXT:    addi t3, zero, 9
; LMULMAX8-NEXT:    addi t4, zero, 10
; LMULMAX8-NEXT:    addi t5, zero, 11
; LMULMAX8-NEXT:    addi t6, zero, 12
; LMULMAX8-NEXT:    sd a0, 0(sp)
; LMULMAX8-NEXT:    mv a0, zero
; LMULMAX8-NEXT:    vmv8r.v v16, v8
; LMULMAX8-NEXT:    call vector_arg_direct_stack@plt
; LMULMAX8-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
; LMULMAX8-NEXT:    addi sp, sp, 160
; LMULMAX8-NEXT:    ret
;
; LMULMAX4-LABEL: pass_vector_arg_direct_stack:
; LMULMAX4:       # %bb.0:
; LMULMAX4-NEXT:    addi sp, sp, -160
; LMULMAX4-NEXT:    .cfi_def_cfa_offset 160
; LMULMAX4-NEXT:    sd ra, 152(sp) # 8-byte Folded Spill
; LMULMAX4-NEXT:    .cfi_offset ra, -8
; LMULMAX4-NEXT:    addi a0, zero, 1
; LMULMAX4-NEXT:    sd a0, 136(sp)
; LMULMAX4-NEXT:    addi a0, zero, 13
; LMULMAX4-NEXT:    sd a0, 0(sp)
; LMULMAX4-NEXT:    addi a0, sp, 72
; LMULMAX4-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
; LMULMAX4-NEXT:    vmv.v.i v8, 0
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    addi a0, sp, 8
; LMULMAX4-NEXT:    addi a1, zero, 1
; LMULMAX4-NEXT:    addi a2, zero, 2
; LMULMAX4-NEXT:    addi a3, zero, 3
; LMULMAX4-NEXT:    addi a4, zero, 4
; LMULMAX4-NEXT:    addi a5, zero, 5
; LMULMAX4-NEXT:    addi a6, zero, 6
; LMULMAX4-NEXT:    addi a7, zero, 7
; LMULMAX4-NEXT:    addi t2, zero, 8
; LMULMAX4-NEXT:    addi t3, zero, 9
; LMULMAX4-NEXT:    addi t4, zero, 10
; LMULMAX4-NEXT:    addi t5, zero, 11
; LMULMAX4-NEXT:    addi t6, zero, 12
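; Store the low half of %z to its direct stack slot at 8(sp) (a0 was set to
; sp+8 above; the high half was already written to 72(sp)).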
; LMULMAX4-NEXT:    vse32.v v8, (a0)
; LMULMAX4-NEXT:    mv a0, zero
; LMULMAX4-NEXT:    vmv4r.v v12, v8
; LMULMAX4-NEXT:    vmv4r.v v16, v8
; LMULMAX4-NEXT:    vmv4r.v v20, v8
; LMULMAX4-NEXT:    call vector_arg_direct_stack@plt
; LMULMAX4-NEXT:    ld ra, 152(sp) # 8-byte Folded Reload
; LMULMAX4-NEXT:    addi sp, sp, 160
; LMULMAX4-NEXT:    ret
  %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1)
  ret <32 x i32> %s
}

; A pathological test case where even with fastcc we must use the stack for
; mask argument %m2. %m1 is passed via v0.
define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) {
; CHECK-LABEL: vector_mask_arg_direct_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, mu
; CHECK-NEXT:    addi a0, sp, 152
; CHECK-NEXT:    vle1.v v25, (a0)
; CHECK-NEXT:    vmxor.mm v0, v0, v25
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    ret
  %r = xor <4 x i1> %m1, %m2
  ret <4 x i1> %r
}