; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
; REQUIRES: asserts

; Verify that LV can handle outer loops marked for explicit vectorization when
; their branches are uniform, but bails out on outer loops with divergent branches.

; Root C/C++ source code for the test cases
; void foo(int *a, int *b, int N, int M)
; {
;   int i, j;
; #pragma clang loop vectorize(enable) vectorize_width(8)
;   for (i = 0; i < N; i++) {
;     // Tested conditional branch. COND will be replaced per test.
;     if (COND)
;       for (j = 0; j < M; j++) {
;         a[i*M+j] = b[i*M+j] * b[i*M+j];
;       }
;   }
; }

; Case 1 (COND => M == N): Outer loop with uniform conditional branch.

; CHECK-LABEL: uniform_branch
; CHECK: LV: We can vectorize this outer loop!

; Module data layout: little-endian (e), ELF name mangling (m:e), 64-bit aligned
; i64, 128-bit aligned f80, native integer widths 8/16/32/64, and a 128-bit
; natural stack alignment (S128).
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Case 1 kernel. The guard on the inner loop (%brmerge) is computed once in
; %outer.ph from the arguments %M and %N only, so it has the same value on
; every iteration of the outer loop.
;
;   %a - destination array, written at a[i*M + j]
;   %b - source array, read at b[i*M + j]
;   %N - outer-loop bound; the whole loop nest is skipped unless N > 0
;   %M - inner-loop bound and row stride
define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  ; Skip the loop nest entirely when the outer loop would run zero times.
  %cmp39 = icmp sgt i32 %N, 0
  br i1 %cmp39, label %outer.ph, label %for.end19

outer.ph:                                   ; preds = %entry
  ; %cmp337 is true when the inner loop would run zero times (M < 1).
  %cmp337 = icmp slt i32 %M, 1
  %0 = sext i32 %M to i64
  %N64 = zext i32 %N to i64
  %M64 = zext i32 %M to i64
  %cmp1 = icmp ne i32 %M, %N ; Uniform condition
  ; Inner loop is bypassed when M != N or M < 1.
  %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition
  br label %outer.body

outer.body:                                 ; preds = %outer.inc, %outer.ph
  %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
  ; %1 = i * M, the offset of row i.
  %1 = mul nsw i64 %indvars.iv42, %0
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
  ; NOTE(review): %2 (b[i*M]) is never used in this function; presumably it is
  ; kept so the loop body mirrors the divergent case -- confirm before removing.
  %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
  br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch

inner.ph:                                   ; preds = %outer.body
  br label %inner.body

inner.body:                                 ; preds = %inner.ph, %inner.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
  ; %3 = i*M + j, shared index for the load from %b and the store to %a.
  %3 = add nsw i64 %indvars.iv, %1
  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
  %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
  ; a[i*M + j] = b[i*M + j] * b[i*M + j]
  %mul12 = mul nsw i32 %4, %4
  %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
  store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %M64
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                  ; preds = %inner.body, %outer.body
  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
  %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
  ; The outer-loop latch carries the vectorize(enable)/width(8) loop hints (!6).
  br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6

for.end19:                                  ; preds = %outer.inc, %entry
  ret void
}


; Case 2 (COND => b[i * M] == 0): Outer loop with divergent conditional branch.

; CHECK-LABEL: divergent_branch
; CHECK: Unsupported conditional branch.
; CHECK: LV: Not vectorizing: Unsupported outer loop.

; Case 2 kernel. The guard on the inner loop (%brmerge) is recomputed inside
; %outer.body from the value loaded at b[i*M], so it can differ from one
; outer-loop iteration to the next.
;
;   %a - destination array, written at a[i*M + j]
;   %b - source array, read at b[i*M + j] and at b[i*M] for the guard
;   %N - outer-loop bound; the whole loop nest is skipped unless N > 0
;   %M - inner-loop bound and row stride
define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  ; Skip the loop nest entirely when the outer loop would run zero times.
  %cmp39 = icmp sgt i32 %N, 0
  br i1 %cmp39, label %outer.ph, label %for.end19

outer.ph:                                   ; preds = %entry
  ; %cmp337 is true when the inner loop would run zero times (M < 1).
  %cmp337 = icmp slt i32 %M, 1
  %0 = sext i32 %M to i64
  %N64 = zext i32 %N to i64
  %M64 = zext i32 %M to i64
  br label %outer.body

outer.body:                                 ; preds = %outer.inc, %outer.ph
  %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
  ; %1 = i * M, the offset of row i.
  %1 = mul nsw i64 %indvars.iv42, %0
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
  ; %2 = b[i*M]; this per-iteration value feeds the branch condition below.
  %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %cmp1 = icmp ne i32 %2, 0 ; Divergent condition
  ; Inner loop is bypassed when b[i*M] != 0 or M < 1.
  %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition
  br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch.

inner.ph:                                   ; preds = %outer.body
  br label %inner.body

inner.body:                                 ; preds = %inner.ph, %inner.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
  ; %3 = i*M + j, shared index for the load from %b and the store to %a.
  %3 = add nsw i64 %indvars.iv, %1
  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
  %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
  ; a[i*M + j] = b[i*M + j] * b[i*M + j]
  %mul12 = mul nsw i32 %4, %4
  %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
  store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %M64
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                  ; preds = %inner.body, %outer.body
  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
  %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
  ; The outer-loop latch carries the vectorize(enable)/width(8) loop hints (!6).
  br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6

for.end19:                                  ; preds = %outer.inc, %entry
  ret void
}

; Module-level named metadata.
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0"}
; TBAA: !2 is the access tag attached to the i32 loads and stores; it describes
; an "int" access (!3) under the "omnipotent char" node (!4) of the
; "Simple C/C++ TBAA" root (!5).
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
; Loop hints attached to the outer-loop latch branches via !llvm.loop !6:
; vectorization explicitly enabled (!8) with a requested width of 8 (!7).
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.width", i32 8}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}
