; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -disable-lsr \
; RUN:   -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr10 < %s | FileCheck %s \
; RUN:   --check-prefix=CHECK-BE

; This test checks the PPCLoopInstrFormPrep pass supports the lxvp and stxvp
; intrinsics so we generate more dq-form instructions instead of x-forms.

; Packed (unpadded) single-double element types for the x and y arrays.
%_elem_type_of_x = type <{ double }>
%_elem_type_of_y = type <{ double }>
define void @foo(i64* %.n, [0 x %_elem_type_of_x]* %.x, [0 x %_elem_type_of_y]* %.y, <2 x double>* %.sum) {
; CHECK-LABEL: foo:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    ld r5, 0(r3)
; CHECK-NEXT:    cmpdi r5, 1
; CHECK-NEXT:    bltlr cr0
; CHECK-NEXT:  # %bb.1: # %_loop_1_do_.lr.ph
; CHECK-NEXT:    addi r3, r4, 1
; CHECK-NEXT:    addi r4, r5, -1
; CHECK-NEXT:    lxv vs0, 0(r6)
; CHECK-NEXT:    rldicl r4, r4, 60, 4
; CHECK-NEXT:    addi r4, r4, 1
; CHECK-NEXT:    mtctr r4
; CHECK-NEXT:    .p2align 5
; CHECK-NEXT:  .LBB0_2: # %_loop_1_do_
; CHECK-NEXT:    #
; CHECK-NEXT:    lxvp vsp2, 0(r3)
; CHECK-NEXT:    lxvp vsp4, 32(r3)
; CHECK-NEXT:    addi r3, r3, 128
; CHECK-NEXT:    xvadddp vs0, vs0, vs3
; CHECK-NEXT:    xvadddp vs0, vs0, vs2
; CHECK-NEXT:    xvadddp vs0, vs0, vs5
; CHECK-NEXT:    xvadddp vs0, vs0, vs4
; CHECK-NEXT:    bdnz .LBB0_2
; CHECK-NEXT:  # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge
; CHECK-NEXT:    stxv vs0, 0(r6)
; CHECK-NEXT:    blr
;
; CHECK-BE-LABEL: foo:
; CHECK-BE:       # %bb.0: # %entry
; CHECK-BE-NEXT:    ld r5, 0(r3)
; CHECK-BE-NEXT:    cmpdi r5, 1
; CHECK-BE-NEXT:    bltlr cr0
; CHECK-BE-NEXT:  # %bb.1: # %_loop_1_do_.lr.ph
; CHECK-BE-NEXT:    addi r3, r4, 1
; CHECK-BE-NEXT:    addi r4, r5, -1
; CHECK-BE-NEXT:    lxv vs0, 0(r6)
; CHECK-BE-NEXT:    rldicl r4, r4, 60, 4
; CHECK-BE-NEXT:    addi r4, r4, 1
; CHECK-BE-NEXT:    mtctr r4
; CHECK-BE-NEXT:    .p2align 5
; CHECK-BE-NEXT:  .LBB0_2: # %_loop_1_do_
; CHECK-BE-NEXT:    #
; CHECK-BE-NEXT:    lxvp vsp2, 0(r3)
; CHECK-BE-NEXT:    lxvp vsp4, 32(r3)
; CHECK-BE-NEXT:    addi r3, r3, 128
; CHECK-BE-NEXT:    xvadddp vs0, vs0, vs2
; CHECK-BE-NEXT:    xvadddp vs0, vs0, vs3
; CHECK-BE-NEXT:    xvadddp vs0, vs0, vs4
; CHECK-BE-NEXT:    xvadddp vs0, vs0, vs5
; CHECK-BE-NEXT:    bdnz .LBB0_2
; CHECK-BE-NEXT:  # %bb.3: # %_loop_1_loopHeader_._return_bb_crit_edge
; CHECK-BE-NEXT:    stxv vs0, 0(r6)
; CHECK-BE-NEXT:    blr
entry:
  ; Skip the loop entirely when *%.n < 1.
  %_val_n_2 = load i64, i64* %.n, align 8
  %_grt_tmp7 = icmp slt i64 %_val_n_2, 1
  br i1 %_grt_tmp7, label %_return_bb, label %_loop_1_do_.lr.ph

_loop_1_do_.lr.ph:                                ; preds = %entry
  %x_rvo_based_addr_5 = getelementptr inbounds [0 x %_elem_type_of_x], [0 x %_elem_type_of_x]* %.x, i64 0, i64 -1
  %.sum.promoted = load <2 x double>, <2 x double>* %.sum, align 16
  br label %_loop_1_do_

_loop_1_do_:                                      ; preds = %_loop_1_do_.lr.ph, %_loop_1_do_
  ; Each iteration issues two paired 32-byte loads (lxvp) at byte offsets 1 and
  ; 33 from the current element, then folds all four <2 x double> halves into
  ; the running vector sum.
  %_val_sum_9 = phi <2 x double> [ %.sum.promoted, %_loop_1_do_.lr.ph ], [ %_add_tmp49, %_loop_1_do_ ]
  %i.08 = phi i64 [ 1, %_loop_1_do_.lr.ph ], [ %_loop_1_update_loop_ix, %_loop_1_do_ ]
  %x_ix_dim_0_6 = getelementptr %_elem_type_of_x, %_elem_type_of_x* %x_rvo_based_addr_5, i64 %i.08
  %x_ix_dim_0_ = bitcast %_elem_type_of_x* %x_ix_dim_0_6 to i8*
  %0 = getelementptr i8, i8* %x_ix_dim_0_, i64 1
  %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %0)
  %2 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %1)
  %.fca.0.extract1 = extractvalue { <16 x i8>, <16 x i8> } %2, 0
  %.fca.1.extract2 = extractvalue { <16 x i8>, <16 x i8> } %2, 1
  %3 = getelementptr i8, i8* %x_ix_dim_0_, i64 33
  %4 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(i8* %3)
  %5 = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %4)
  %.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 0
  %.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %5, 1
  %6 = bitcast <16 x i8> %.fca.0.extract1 to <2 x double>
  %_add_tmp23 = fadd contract <2 x double> %_val_sum_9, %6
  %7 = bitcast <16 x i8> %.fca.1.extract2 to <2 x double>
  %_add_tmp32 = fadd contract <2 x double> %_add_tmp23, %7
  %8 = bitcast <16 x i8> %.fca.0.extract to <2 x double>
  %_add_tmp40 = fadd contract <2 x double> %_add_tmp32, %8
  %9 = bitcast <16 x i8> %.fca.1.extract to <2 x double>
  %_add_tmp49 = fadd contract <2 x double> %_add_tmp40, %9
  %_loop_1_update_loop_ix = add nuw nsw i64 %i.08, 16
  %_grt_tmp = icmp sgt i64 %_loop_1_update_loop_ix, %_val_n_2
  br i1 %_grt_tmp, label %_loop_1_loopHeader_._return_bb_crit_edge, label %_loop_1_do_

_loop_1_loopHeader_._return_bb_crit_edge:         ; preds = %_loop_1_do_
  store <2 x double> %_add_tmp49, <2 x double>* %.sum, align 16
  br label %_return_bb

_return_bb:                                       ; preds = %_loop_1_loopHeader_._return_bb_crit_edge, %entry
  ret void
}
; PowerPC VSX paired-vector intrinsics: lxvp loads a 256-bit register pair,
; disassemble.pair splits it into its two 128-bit halves.
declare <256 x i1> @llvm.ppc.vsx.lxvp(i8*)
declare { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1>)