; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -analyze -polly-ast -polly-target-1st-cache-level-associativity=8 \
; RUN: -polly-target-2nd-cache-level-associativity=8 \
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 < %s \
; RUN: | FileCheck %s
;
; NOTE: the command below carries no lit prefix, so it is never executed by
; the test runner; the AUTO-VECTORIZATION expectations near the bottom of this
; file are therefore only verified when the command is run by hand.
;
; opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
; -polly-target-throughput-vector-fma=1 \
; -polly-target-latency-vector-fma=8 \
; -polly-codegen -polly-target-1st-cache-level-associativity=8 \
; -polly-target-2nd-cache-level-associativity=8 \
; -polly-target-1st-cache-level-size=32768 \
; -polly-target-vector-register-bitwidth=256 \
; -polly-target-2nd-cache-level-size=262144 -gvn -licm -slp-vectorizer \
; -mcpu=corei7 -stats -S < %s 2>&1 | FileCheck %s --check-prefix=AUTO-VECTORIZATION
;
;
;    /* We isolate a set of partial tile prefixes, which contains only partial
;       tile prefixes that have exactly Mr x Nr iterations of the two innermost
;       loops produced by the optimization of the matrix multiplication. Mr and
;       Nr are parameters of the micro-kernel (see getMicroKernelParams and
;       getMacroKernelParams from lib/Transform/ScheduleOptimizer.cpp for
;       details). This test checks that in case of parametric bounds it helps to
;       get rid of the conditional expressions of the unrolled innermost loops,
;       which prevents stores and loads of the unrolled loops from being sunk
;       and hoisted. Otherwise, it causes a run-time regression in comparison
;       to the vectorized code with sunk and hoisted memory accesses. */
;
;    /* C := A * B + C */
;    for (i = 0; i < _PB_NI; i++)
;      for (j = 0; j < _PB_NJ; j++)
;        for (k = 0; k < _PB_NK; ++k)
;          C[i][j] += A[i][k] * B[k][j];
;
; CHECK:      if (ni >= 1) {
; CHECK-NEXT:   // Inter iteration alias-free
; CHECK-NEXT:   // 1st level tiling - Tiles
; CHECK-NEXT:   for (int c0 = 0; c0 <= floord(nj - 1, 2048); c0 += 1)
; CHECK-NEXT:     for (int c1 = 0; c1 <= floord(nk - 1, 256); c1 += 1) {
; CHECK-NEXT:       for (int c3 = 2048 * c0; c3 <= min(nj - 1, 2048 * c0 + 2047); c3 += 1)
; CHECK-NEXT:         for (int c4 = 256 * c1; c4 <= min(nk - 1, 256 * c1 + 255); c4 += 1)
; CHECK-NEXT:           CopyStmt_0(0, c3, c4);
; CHECK-NEXT:       for (int c2 = 0; c2 <= floord(ni - 1, 96); c2 += 1) {
; CHECK-NEXT:         if (c0 == 0)
; CHECK-NEXT:           for (int c3 = 96 * c2; c3 <= min(ni - 1, 96 * c2 + 95); c3 += 1)
; CHECK-NEXT:             for (int c5 = 256 * c1; c5 <= min(nk - 1, 256 * c1 + 255); c5 += 1)
; CHECK-NEXT:               CopyStmt_1(c3, 0, c5);
; CHECK-NEXT:         // 1st level tiling - Points
; CHECK-NEXT:         // Register tiling - Tiles
; CHECK-NEXT:         {
; CHECK-NEXT:           if (ni >= 96 * c2 + 4)
; CHECK-NEXT:             for (int c3 = 0; c3 <= min(255, -256 * c0 + nj / 8 - 1); c3 += 1) {
; CHECK-NEXT:               for (int c4 = 0; c4 <= min(23, -24 * c2 + ni / 4 - 1); c4 += 1)
; CHECK-NEXT:                 for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT:                   // Loop Vectorizer Disabled
; CHECK-NEXT:                   // Register tiling - Points
; CHECK-NEXT:                   {
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                   }
; CHECK-NEXT:                 }
; CHECK-NEXT:               if ((ni >= 96 * c2 + 5 && 96 * c2 + 7 >= ni && c3 >= 0) || (ni >= 96 * c2 + 8 && 96 * c2 + 95 >= ni && ni % 4 >= 1)) {
; CHECK-NEXT:                 if (96 * c2 + 7 >= ni) {
; CHECK-NEXT:                   for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT:                     // Loop Vectorizer Disabled
; CHECK-NEXT:                     // Register tiling - Points
; CHECK-NEXT:                     {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                       if (ni >= 96 * c2 + 6) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 5, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                         if (96 * c2 + 7 == ni) {
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                   }
; CHECK-NEXT:                 } else {
; CHECK-NEXT:                   for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT:                     // Loop Vectorizer Disabled
; CHECK-NEXT:                     // Register tiling - Points
; CHECK-NEXT:                     {
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                       Stmt_for_body6(-((ni - 1) % 4) + ni - 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                       if (ni % 4 >= 2) {
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                         Stmt_for_body6(-((ni - 1) % 4) + ni, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                         if ((ni + 1) % 4 == 0) {
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                   }
; CHECK-NEXT:                 }
; CHECK-NEXT:               }
; CHECK-NEXT:             }
; CHECK-NEXT:           if (96 * c2 + 3 >= ni || 2048 * c0 + 7 >= nj || (2048 * c0 + 2047 >= nj && nj % 8 >= 1)) {
; CHECK-NEXT:             if (2048 * c0 + 7 >= nj) {
; CHECK-NEXT:               for (int c4 = 0; c4 <= min(23, -24 * c2 + (ni - 1) / 4); c4 += 1)
; CHECK-NEXT:                 for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT:                   // Loop Vectorizer Disabled
; CHECK-NEXT:                   // Register tiling - Points
; CHECK-NEXT:                   {
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0, 256 * c1 + c5);
; CHECK-NEXT:                     if (nj >= 2048 * c0 + 2) {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 1, 256 * c1 + c5);
; CHECK-NEXT:                       if (nj >= 2048 * c0 + 3) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 2, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj >= 2048 * c0 + 4) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 3, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 5) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 4, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 6) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4, 2048 * c0 + 5, 256 * c1 + c5);
; CHECK-NEXT:                               if (2048 * c0 + 7 == nj)
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                     if (ni >= 96 * c2 + 4 * c4 + 2) {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0, 256 * c1 + c5);
; CHECK-NEXT:                       if (nj >= 2048 * c0 + 2) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 1, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj >= 2048 * c0 + 3) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 2, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 4) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 3, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 5) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 4, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj >= 2048 * c0 + 6) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4 + 1, 2048 * c0 + 5, 256 * c1 + c5);
; CHECK-NEXT:                                 if (2048 * c0 + 7 == nj)
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 4 * c4 + 1, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                       if (ni >= 96 * c2 + 4 * c4 + 3) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj >= 2048 * c0 + 2) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 1, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 3) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 2, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 4) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 3, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj >= 2048 * c0 + 5) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 4, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj >= 2048 * c0 + 6) {
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 4 * c4 + 2, 2048 * c0 + 5, 256 * c1 + c5);
; CHECK-NEXT:                                   if (2048 * c0 + 7 == nj)
; CHECK-NEXT:                                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                                 }
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                         if (ni >= 96 * c2 + 4 * c4 + 4) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 2) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 1, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 3) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 2, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj >= 2048 * c0 + 4) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 3, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj >= 2048 * c0 + 5) {
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 4, 256 * c1 + c5);
; CHECK-NEXT:                                   if (nj >= 2048 * c0 + 6) {
; CHECK-NEXT:                                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, 2048 * c0 + 5, 256 * c1 + c5);
; CHECK-NEXT:                                     if (2048 * c0 + 7 == nj)
; CHECK-NEXT:                                       Stmt_for_body6(96 * c2 + 4 * c4 + 3, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                                   }
; CHECK-NEXT:                                 }
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                   }
; CHECK-NEXT:                 }
; CHECK-NEXT:             } else if (96 * c2 + 3 >= ni) {
; CHECK-NEXT:               for (int c3 = 0; c3 <= min(255, -256 * c0 + (nj - 1) / 8); c3 += 1)
; CHECK-NEXT:                 for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT:                   // Loop Vectorizer Disabled
; CHECK-NEXT:                   // Register tiling - Points
; CHECK-NEXT:                   {
; CHECK-NEXT:                     Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                     if (nj >= 2048 * c0 + 8 * c3 + 2) {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                       if (nj >= 2048 * c0 + 8 * c3 + 3) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj >= 2048 * c0 + 8 * c3 + 4) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 8 * c3 + 5) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 8 * c3 + 6) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj >= 2048 * c0 + 8 * c3 + 7) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj >= 2048 * c0 + 8 * c3 + 8)
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                     if (ni >= 96 * c2 + 2) {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                       if (nj >= 2048 * c0 + 8 * c3 + 2) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj >= 2048 * c0 + 8 * c3 + 3) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 8 * c3 + 4) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 8 * c3 + 5) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj >= 2048 * c0 + 8 * c3 + 6) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj >= 2048 * c0 + 8 * c3 + 7) {
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                                   if (nj >= 2048 * c0 + 8 * c3 + 8)
; CHECK-NEXT:                                     Stmt_for_body6(96 * c2 + 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                                 }
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                       if (96 * c2 + 3 == ni) {
; CHECK-NEXT:                         Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj >= 2048 * c0 + 8 * c3 + 2) {
; CHECK-NEXT:                           Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 1, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj >= 2048 * c0 + 8 * c3 + 3) {
; CHECK-NEXT:                             Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 2, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj >= 2048 * c0 + 8 * c3 + 4) {
; CHECK-NEXT:                               Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 3, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj >= 2048 * c0 + 8 * c3 + 5) {
; CHECK-NEXT:                                 Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 4, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj >= 2048 * c0 + 8 * c3 + 6) {
; CHECK-NEXT:                                   Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 5, 256 * c1 + c5);
; CHECK-NEXT:                                   if (nj >= 2048 * c0 + 8 * c3 + 7) {
; CHECK-NEXT:                                     Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 6, 256 * c1 + c5);
; CHECK-NEXT:                                     if (nj >= 2048 * c0 + 8 * c3 + 8)
; CHECK-NEXT:                                       Stmt_for_body6(ni - 1, 2048 * c0 + 8 * c3 + 7, 256 * c1 + c5);
; CHECK-NEXT:                                   }
; CHECK-NEXT:                                 }
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                   }
; CHECK-NEXT:                 }
; CHECK-NEXT:             } else {
; CHECK-NEXT:               for (int c4 = 0; c4 <= min(23, -24 * c2 + (ni - 1) / 4); c4 += 1)
; CHECK-NEXT:                 for (int c5 = 0; c5 <= min(255, nk - 256 * c1 - 1); c5 += 1) {
; CHECK-NEXT:                   // Loop Vectorizer Disabled
; CHECK-NEXT:                   // Register tiling - Points
; CHECK-NEXT:                   {
; CHECK-NEXT:                     Stmt_for_body6(96 * c2 + 4 * c4, -((nj - 1) % 8) + nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                     if (nj % 8 >= 2) {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4 * c4, -((nj - 1) % 8) + nj, 256 * c1 + c5);
; CHECK-NEXT:                       if (nj % 8 >= 3) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 4 * c4, -((nj - 1) % 8) + nj + 1, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj % 8 >= 4) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4, -((nj - 1) % 8) + nj + 2, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj % 8 >= 5) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4, -((nj - 1) % 8) + nj + 3, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj % 8 >= 6) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4, -((nj - 1) % 8) + nj + 4, 256 * c1 + c5);
; CHECK-NEXT:                               if ((nj + 1) % 8 == 0)
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                     if (ni >= 96 * c2 + 4 * c4 + 2) {
; CHECK-NEXT:                       Stmt_for_body6(96 * c2 + 4 * c4 + 1, -((nj - 1) % 8) + nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                       if (nj % 8 >= 2) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 4 * c4 + 1, -((nj - 1) % 8) + nj, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj % 8 >= 3) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4 + 1, -((nj - 1) % 8) + nj + 1, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj % 8 >= 4) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4 + 1, -((nj - 1) % 8) + nj + 2, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj % 8 >= 5) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4 + 1, -((nj - 1) % 8) + nj + 3, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj % 8 >= 6) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4 + 1, -((nj - 1) % 8) + nj + 4, 256 * c1 + c5);
; CHECK-NEXT:                                 if ((nj + 1) % 8 == 0)
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 4 * c4 + 1, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                       if (ni >= 96 * c2 + 4 * c4 + 3) {
; CHECK-NEXT:                         Stmt_for_body6(96 * c2 + 4 * c4 + 2, -((nj - 1) % 8) + nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                         if (nj % 8 >= 2) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4 + 2, -((nj - 1) % 8) + nj, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj % 8 >= 3) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4 + 2, -((nj - 1) % 8) + nj + 1, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj % 8 >= 4) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4 + 2, -((nj - 1) % 8) + nj + 2, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj % 8 >= 5) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4 + 2, -((nj - 1) % 8) + nj + 3, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj % 8 >= 6) {
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 4 * c4 + 2, -((nj - 1) % 8) + nj + 4, 256 * c1 + c5);
; CHECK-NEXT:                                   if ((nj + 1) % 8 == 0)
; CHECK-NEXT:                                     Stmt_for_body6(96 * c2 + 4 * c4 + 2, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                                 }
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                         if (ni >= 96 * c2 + 4 * c4 + 4) {
; CHECK-NEXT:                           Stmt_for_body6(96 * c2 + 4 * c4 + 3, -((nj - 1) % 8) + nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                           if (nj % 8 >= 2) {
; CHECK-NEXT:                             Stmt_for_body6(96 * c2 + 4 * c4 + 3, -((nj - 1) % 8) + nj, 256 * c1 + c5);
; CHECK-NEXT:                             if (nj % 8 >= 3) {
; CHECK-NEXT:                               Stmt_for_body6(96 * c2 + 4 * c4 + 3, -((nj - 1) % 8) + nj + 1, 256 * c1 + c5);
; CHECK-NEXT:                               if (nj % 8 >= 4) {
; CHECK-NEXT:                                 Stmt_for_body6(96 * c2 + 4 * c4 + 3, -((nj - 1) % 8) + nj + 2, 256 * c1 + c5);
; CHECK-NEXT:                                 if (nj % 8 >= 5) {
; CHECK-NEXT:                                   Stmt_for_body6(96 * c2 + 4 * c4 + 3, -((nj - 1) % 8) + nj + 3, 256 * c1 + c5);
; CHECK-NEXT:                                   if (nj % 8 >= 6) {
; CHECK-NEXT:                                     Stmt_for_body6(96 * c2 + 4 * c4 + 3, -((nj - 1) % 8) + nj + 4, 256 * c1 + c5);
; CHECK-NEXT:                                     if ((nj + 1) % 8 == 0)
; CHECK-NEXT:                                       Stmt_for_body6(96 * c2 + 4 * c4 + 3, nj - 1, 256 * c1 + c5);
; CHECK-NEXT:                                   }
; CHECK-NEXT:                                 }
; CHECK-NEXT:                               }
; CHECK-NEXT:                             }
; CHECK-NEXT:                           }
; CHECK-NEXT:                         }
; CHECK-NEXT:                       }
; CHECK-NEXT:                     }
; CHECK-NEXT:                   }
; CHECK-NEXT:                 }
; CHECK-NEXT:             }
; CHECK-NEXT:           }
; CHECK-NEXT:         }
; CHECK-NEXT:       }
; CHECK-NEXT:     }
; CHECK-NEXT: }
;

; AUTO-VECTORIZATION: fmul <4 x double>
; AUTO-VECTORIZATION: fadd <4 x double>

; AUTO-VECTORIZATION: 36 SLP - Number of vector instructions generated
; AUTO-VECTORIZATION: 453 licm - Number of instructions hoisted out of loop
; AUTO-VECTORIZATION: 2 licm - Number of load insts hoisted or sunk
; AUTO-VECTORIZATION: 32 licm - Number of memory locations promoted to registers
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; kernel_gemm: IR for the three-deep loop nest quoted in the header comment,
;   C[i][j] += A[i][k] * B[k][j]
; The trip counts come from the i32 parameters %ni (i), %nj (j) and %nk (k).
; %A, %B and %C are indexed as arrays whose rows hold 1024 doubles.
; %alpha and %beta are part of the signature but are not read in the body.
; Value and block names must stay as they are: the expected AST above refers
; to Stmt_for_body6, which presumably is derived from the for.body6 block.
define internal void @kernel_gemm(i32 %ni, i32 %nj, i32 %nk, double %alpha, double %beta, [1024 x double]* %C, [1024 x double]* %A, [1024 x double]* %B) #0 {
entry:
  br label %entry.split

; Guard for the whole nest: nothing executes unless ni > 0.
entry.split:                                      ; preds = %entry
  %cmp39 = icmp sgt i32 %ni, 0
  br i1 %cmp39, label %for.cond1.preheader.lr.ph, label %for.end22

for.cond1.preheader.lr.ph:                        ; preds = %entry.split
  br label %for.cond1.preheader

; Outer (i) loop header: %indvars.iv45 is i. The j loop is skipped when nj <= 0.
for.cond1.preheader:                              ; preds = %for.inc20, %for.cond1.preheader.lr.ph
  %indvars.iv45 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next46, %for.inc20 ]
  %cmp237 = icmp sgt i32 %nj, 0
  br i1 %cmp237, label %for.cond4.preheader.lr.ph, label %for.inc20

for.cond4.preheader.lr.ph:                        ; preds = %for.cond1.preheader
  br label %for.cond4.preheader

; Middle (j) loop header: %indvars.iv41 is j. The k loop is skipped when nk <= 0.
for.cond4.preheader:                              ; preds = %for.inc17, %for.cond4.preheader.lr.ph
  %indvars.iv41 = phi i64 [ 0, %for.cond4.preheader.lr.ph ], [ %indvars.iv.next42, %for.inc17 ]
  %cmp535 = icmp sgt i32 %nk, 0
  br i1 %cmp535, label %for.body6.lr.ph, label %for.inc17

for.body6.lr.ph:                                  ; preds = %for.cond4.preheader
  br label %for.body6

; Innermost (k) loop: %indvars.iv is k. One iteration loads A[i][k], B[k][j]
; and C[i][j], then stores C[i][j] + A[i][k] * B[k][j] back to C[i][j].
for.body6:                                        ; preds = %for.body6, %for.body6.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body6.lr.ph ], [ %indvars.iv.next, %for.body6 ]
  %arrayidx8 = getelementptr inbounds [1024 x double], [1024 x double]* %A, i64 %indvars.iv45, i64 %indvars.iv
  %tmp = load double, double* %arrayidx8, align 8
  %arrayidx12 = getelementptr inbounds [1024 x double], [1024 x double]* %B, i64 %indvars.iv, i64 %indvars.iv41
  %tmp1 = load double, double* %arrayidx12, align 8
  %mul = fmul double %tmp, %tmp1
  %arrayidx16 = getelementptr inbounds [1024 x double], [1024 x double]* %C, i64 %indvars.iv45, i64 %indvars.iv41
  %tmp2 = load double, double* %arrayidx16, align 8
  %add = fadd double %tmp2, %mul
  store double %add, double* %arrayidx16, align 8
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %wide.trip.count = zext i32 %nk to i64
  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.body6, label %for.cond4.for.inc17_crit_edge

for.cond4.for.inc17_crit_edge:                    ; preds = %for.body6
  br label %for.inc17

; j latch: advance j and repeat until it reaches nj.
for.inc17:                                        ; preds = %for.cond4.for.inc17_crit_edge, %for.cond4.preheader
  %indvars.iv.next42 = add nuw nsw i64 %indvars.iv41, 1
  %wide.trip.count43 = zext i32 %nj to i64
  %exitcond44 = icmp ne i64 %indvars.iv.next42, %wide.trip.count43
  br i1 %exitcond44, label %for.cond4.preheader, label %for.cond1.for.inc20_crit_edge

for.cond1.for.inc20_crit_edge:                    ; preds = %for.inc17
  br label %for.inc20

; i latch: advance i and repeat until it reaches ni.
for.inc20:                                        ; preds = %for.cond1.for.inc20_crit_edge, %for.cond1.preheader
  %indvars.iv.next46 = add nuw nsw i64 %indvars.iv45, 1
  %wide.trip.count47 = zext i32 %ni to i64
  %exitcond48 = icmp ne i64 %indvars.iv.next46, %wide.trip.count47
  br i1 %exitcond48, label %for.cond1.preheader, label %for.cond.for.end22_crit_edge

for.cond.for.end22_crit_edge:                     ; preds = %for.inc20
  br label %for.end22

for.end22:                                        ; preds = %for.cond.for.end22_crit_edge, %entry.split
  ret void
}

attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+aes,+avx,+cmov,+cx16,+fxsr,+mmx,+pclmul,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" }