; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
; REQUIRES: asserts

; Verify that LV can handle explicit vectorization outer loops with uniform branches
; but bails out on outer loops with divergent branches.

; Root C/C++ source code for the test cases
; void foo(int *a, int *b, int N, int M)
; {
;   int i, j;
; #pragma clang loop vectorize(enable) vectorize_width(8)
;   for (i = 0; i < N; i++) {
;     // Tested conditional branch. COND will be replaced per test.
;     if (COND)
;       for (j = 0; j < M; j++) {
;         a[i*M+j] = b[i*M+j] * b[i*M+j];
;       }
;   }
; }

; Case 1 (COND => M == N): Outer loop with uniform conditional branch.

; CHECK-LABEL: uniform_branch
; CHECK: LV: We can vectorize this outer loop!

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; The branch condition %brmerge is computed once in %outer.ph from the
; arguments %M and %N only, so it has the same value on every iteration of the
; outer loop.
define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  %cmp39 = icmp sgt i32 %N, 0
  br i1 %cmp39, label %outer.ph, label %for.end19

outer.ph:                                         ; preds = %entry
  %cmp337 = icmp slt i32 %M, 1
  %0 = sext i32 %M to i64
  %N64 = zext i32 %N to i64
  %M64 = zext i32 %M to i64
  %cmp1 = icmp ne i32 %M, %N ; Uniform condition
  %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition
  br label %outer.body

outer.body:                                       ; preds = %outer.inc, %outer.ph
  %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
  %1 = mul nsw i64 %indvars.iv42, %0
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
  %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
  br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch

inner.ph:                                         ; preds = %outer.body
  br label %inner.body

inner.body:                                       ; preds = %inner.ph, %inner.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
  %3 = add nsw i64 %indvars.iv, %1
  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
  %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
  %mul12 = mul nsw i32 %4, %4
  %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
  store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %M64
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                        ; preds = %inner.body, %outer.body
  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
  %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
  br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6

for.end19:                                        ; preds = %outer.inc, %entry
  ret void
}


; Case 2 (COND => b[i * M] == 0): Outer loop with divergent conditional branch.

; CHECK-LABEL: divergent_branch
; CHECK: Unsupported conditional branch.
; CHECK: LV: Not vectorizing: Unsupported outer loop.

; Here the branch condition is computed inside %outer.body from %2, a value
; loaded from %b at an offset derived from the outer induction variable, so it
; can differ from one outer-loop iteration to the next.
define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
entry:
  %cmp39 = icmp sgt i32 %N, 0
  br i1 %cmp39, label %outer.ph, label %for.end19

outer.ph:                                         ; preds = %entry
  %cmp337 = icmp slt i32 %M, 1
  %0 = sext i32 %M to i64
  %N64 = zext i32 %N to i64
  %M64 = zext i32 %M to i64
  br label %outer.body

outer.body:                                       ; preds = %outer.inc, %outer.ph
  %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
  %1 = mul nsw i64 %indvars.iv42, %0
  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
  %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
  %cmp1 = icmp ne i32 %2, 0 ; Divergent condition
  %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition
  br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch.

inner.ph:                                         ; preds = %outer.body
  br label %inner.body

inner.body:                                       ; preds = %inner.ph, %inner.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
  %3 = add nsw i64 %indvars.iv, %1
  %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
  %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
  %mul12 = mul nsw i32 %4, %4
  %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
  store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %M64
  br i1 %exitcond, label %outer.inc, label %inner.body

outer.inc:                                        ; preds = %inner.body, %outer.body
  %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
  %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
  br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6

for.end19:                                        ; preds = %outer.inc, %entry
  ret void
}

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0"}
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7, !8}
!7 = !{!"llvm.loop.vectorize.width", i32 8}
!8 = !{!"llvm.loop.vectorize.enable", i1 true}