#! /usr/local/bin/apl --script

 ⍝ Benchmark for scalar functions (plus inner and outer product).
 ⍝
 ⍝ For every expression listed in MON_EXPR and DYA_EXPR the script measures
 ⍝ the start-up cost and the per-item cost (in CPU cycles) for sequential
 ⍝ and for parallel execution, prints a summary, and writes one perfo_N()
 ⍝ line per expression to the file "parallel_thresholds".

 ⍝ tunable parameters for this benchmark program
 ⍝
DO_PLOT←0             ⍝ do/don't plot the results of start-up cost
ILRC←1000             ⍝ repeat count for the inner loop of start-up cost
LEN_PI←1000000        ⍝ vector length for measuring the per-item cost
PROFILE←4000 2000 50  ⍝ fractions of Integer, Real, and Complex numbers
CORES←3               ⍝ number of cores used for parallel execution
TIME_LIMIT←2000       ⍝ time limit per pass (milliseconds)

)COPY 5 FILE_IO

 ⍝ expressions to be benchmarked
 ⍝
∇Z←MON_EXPR
 ⍝⍝
 ⍝⍝ return the list of monadic benchmark expressions. Every item is a
 ⍝⍝ 6-element vector:
 ⍝⍝
 ⍝⍝ A:    name of the left argument variable ("" for monadic calls)
 ⍝⍝ OP:   the function (or operator expression) to be benchmarked
 ⍝⍝ B:    name of the right argument variable (set up by INIT_DATA)
 ⍝⍝ N:    suffix of the perfo_N macro written by BREAK_EVEN
 ⍝⍝ CN:   first argument of that macro
 ⍝⍝ STAT: statistics ID passed to FIO∆clear_statistics/FIO∆get_statistics
 ⍝⍝
 Z←⍬
 ⍝     A   OP   B           N  CN             STAT
 ⍝-------------------------------------------------------
 Z←Z,⊂ ""  "+"  "Mix_IRC"   1  "F12_PLUS"     35
 Z←Z,⊂ ""  "-"  "Mix_IRC"   1  "F12_MINUS"    35
 Z←Z,⊂ ""  "×"  "Mix_IRC"   1  "F12_TIMES"    35
 Z←Z,⊂ ""  "÷"  "Mix1_IRC"  1  "F12_DIVIDE"   35
 Z←Z,⊂ ""  "∼"  "Bool"      1  "F12_WITHOUT"  35
 Z←Z,⊂ ""  "⌈"  "Mix_IR"    1  "F12_RND_UP"   35
 Z←Z,⊂ ""  "⌊"  "Mix_IR"    1  "F12_RND_DN"   35
 Z←Z,⊂ ""  "!"  "Int2"      1  "F12_BINOM"    35
 Z←Z,⊂ ""  "⋆"  "Mix_IRC"   1  "F12_POWER"    35
 Z←Z,⊂ ""  "⍟"  "Mix1_IRC"  1  "F12_LOGA"     35
 Z←Z,⊂ ""  "○"  "Mix_IRC"   1  "F12_CIRCLE"   35
 Z←Z,⊂ ""  "∣"  "Mix_IR"    1  "F12_STILE"    35
 Z←Z,⊂ ""  "?"  "Int2"      1  "F12_ROLL"     35
∇

∇Z←DYA_EXPR
 ⍝⍝
 ⍝⍝ return the list of dyadic benchmark expressions (including one inner
 ⍝⍝ and one outer product). The item layout is the same as in MON_EXPR.
 ⍝⍝
 Z←⍬
 ⍝     A           OP     B           N  CN             STAT
 ⍝-------------------------------------------------------
 Z←Z,⊂ "Mix_IRC"   "+"    "Mix1_IRC"  2  "F12_PLUS"     36
 Z←Z,⊂ "Mix_IRC"   "-"    "Mix1_IRC"  2  "F12_MINUS"    36
 Z←Z,⊂ "Mix_IRC"   "×"    "Mix1_IRC"  2  "F12_TIMES"    36
 Z←Z,⊂ "Mix1_IRC"  "÷"    "Mix1_IRC"  2  "F12_DIVIDE"   36
 Z←Z,⊂ "Bool"      "∧"    "Bool1"     2  "F2_AND"       36
 Z←Z,⊂ "Bool"      "∨"    "Bool1"     2  "F2_OR"        36
 Z←Z,⊂ "Bool"      "⍲"    "Bool1"     2  "F2_NAND"      36
 Z←Z,⊂ "Bool"      "⍱"    "Bool1"     2  "F2_NOR"       36
 Z←Z,⊂ "Mix_IR"    "⌈"    "Mix_IR"    2  "F12_RND_UP"   36
 Z←Z,⊂ "Mix_IR"    "⌊"    "Mix_IR"    2  "F12_RND_DN"   36
 Z←Z,⊂ "Mix_IRC"   "!"    "Mix_IRC"   2  "F12_BINOM"    36
 Z←Z,⊂ "Mix_IRC"   "⋆"    "Mix_IRC"   2  "F12_POWER"    36
 Z←Z,⊂ "Mix1_IRC"  "⍟"    "Mix1_IRC"  2  "F12_LOGA"     36
 Z←Z,⊂ "Mix_IR "   "<"    "Mix_IR"    2  "F2_LESS"      36
 Z←Z,⊂ "Mix_IR "   "≤"    "Mix_IR"    2  "F2_LEQ"       36
 Z←Z,⊂ "Mix_IRC"   "="    "Mix_IRC"   2  "F2_EQUAL"     36
 Z←Z,⊂ "Mix_IRC"   "≠"    "Mix_IRC"   2  "F2_UNEQ"      36
 Z←Z,⊂ "Mix_IR"    ">"    "Mix_IR"    2  "F2_GREATER"   36
 Z←Z,⊂ "Mix_IR"    "≥"    "Mix_IR"    2  "F2_MEQ"       36
 Z←Z,⊂ "1"         "○"    "Mix_IRC"   2  "F12_CIRCLE"   36
 Z←Z,⊂ "Mix_IRC"   "∣"    "Mix_IRC"   2  "F12_STILE"    36
 Z←Z,⊂ "1 2 3"     "⋸"    "Int"       2  "F12_FIND"     36
 Z←Z,⊂ "Mat1_IRC"  "+.×"  "Mat1_IRC"  3  "OPER2_INNER"  38
 Z←Z,⊂ "Vec1_IRC"  "∘.×"  "Vec1_IRC"  3  "OPER2_OUTER"  39
∇

∇INIT_DATA LEN;N;Ilen;Rlen;Clen
 ⍝⍝
 ⍝⍝ setup the (global) variables used in benchmark expressions:
 ⍝⍝
 ⍝⍝ Int:      random integers ¯2 ... 9
 ⍝⍝ Int1:     nonzero items of Int
 ⍝⍝ Int2:     positive items of Int
 ⍝⍝ Bool:     2 ∣ Int,  Bool1: Bool rotated by 1
 ⍝⍝ Real:     Int + 3÷○1 (roughly ¯1.05 ... 9.95)
 ⍝⍝ Real1:    nonzero items of Real,  Real2: positive items of Real
 ⍝⍝ Comp:     Real + 0J1 × (Real rotated by 1)
 ⍝⍝ Comp1:    nonzero items of Comp
 ⍝⍝ Mix_*:    shuffled catenations of the above
 ⍝⍝ Mat1_IRC: square matrix,  Vec1_IRC: vector, both taken from Mix1_IRC
 ⍝⍝
 ⍝⍝ The variables are first built with the lengths given in PROFILE and are
 ⍝⍝ then reshaped (cyclically) to length LEN.
 ⍝⍝
 (Ilen Rlen Clen)←PROFILE
 Int  ← 10 - ? Ilen ⍴ 12
 Int1 ← Ilen ⍴ (Int≠0)/Int
 Int2 ← Ilen ⍴ (Int>0) / Int
 Bool ← 2 ∣ Int
 Bool1← 1 ⌽ Bool
 Real ← Rlen ⍴ Int + 3 ÷ ○1
 Real1← Rlen ⍴ (Real≠0)/Real
 Real2← Rlen ⍴ (Real>0)/Real
 Comp ← Clen ⍴ Real + 0J1×1⌽Real
 Comp1← Clen ⍴ (Comp≠0)/Comp

 ⍝ catenate and then shuffle (N?N is a random permutation of the indices)
 ⍝
 Mix_IR   ←Int,Real         ◊ Mix_IR  [N?N←⍴Mix_IR  ] ← Mix_IR
 Mix_IRC  ←Int,Real,Comp    ◊ Mix_IRC [N?N←⍴Mix_IRC ] ← Mix_IRC
 Mix1_IRC ←Int1,Real1,Comp1 ◊ Mix1_IRC[N?N←⍴Mix1_IRC] ← Mix1_IRC

 ⍝ bring everything to the requested length
 ⍝
 Int      ← LEN ⍴ Int
 Int1     ← LEN ⍴ Int1
 Int2     ← LEN ⍴ Int2
 Bool     ← LEN ⍴ Bool
 Bool1    ← LEN ⍴ Bool1
 Real     ← LEN ⍴ Real
 Real1    ← LEN ⍴ Real1
 Real2    ← LEN ⍴ Real2
 Comp     ← LEN ⍴ Comp
 Comp1    ← LEN ⍴ Comp1
 Mix_IR   ← LEN ⍴ Mix_IR
 Mix_IRC  ← LEN ⍴ Mix_IRC
 Mix1_IRC ← LEN ⍴ Mix1_IRC
 Mat1_IRC ← (2⍴⌈LEN⋆0.35)⍴Mix1_IRC
 Vec1_IRC ← (⌈LEN⋆0.5)⍴Mix1_IRC
∇

'libaplplot' ⎕FX 'PLOT'

∇EXPR PLOT_P DATA;PLOTARG
 ⍝⍝
 ⍝⍝ plot data if enabled by DO_PLOT
 ⍝⍝
 ⍝⍝ EXPR: benchmark expression (used for the window title)
 ⍝⍝ DATA: matrix to be plotted; column 0 is the x axis
 ⍝⍝
 →DO_PLOT↓0            ⍝ DO_PLOT=0: →0 (return), DO_PLOT=1: →⍬ (continue)
 PLOTARG←'xcol 0;'
 PLOTARG←PLOTARG,'xlabel "result length";'
 PLOTARG←PLOTARG,'ylabel "CPU cycles";'
 PLOTARG←PLOTARG,'draw l;'
 PLOTARG←PLOTARG,'plwindow ' , TITLE EXPR
 ⊣ PLOTARG PLOT DATA
 ⍞                     ⍝ wait for a line of input before continuing
∇

∇Z←Average[X] B
 ⍝⍝ return the average of B along axis X
 Z←(+/[X]B) ÷ (⍴B)[X]
∇

∇Z←TITLE EXPR;A;OP;B
 ⍝⍝
 ⍝⍝ return the APL text of benchmark expression EXPR: 'OP B' if the left
 ⍝⍝ argument name A is empty, otherwise 'A OP B'. The result is used both
 ⍝⍝ as a title and as the text executed (⍎) by ONE_PASS.
 ⍝⍝
 (A OP B)←3↑EXPR
 Z←OP, ' ', B
 →(0=⍴A)/0
 Z←A,' ',Z
∇

∇Z←TITLE1 EXPR;A;OP;B;Z1
 ⍝⍝
 ⍝⍝ return the double-quoted generic form of EXPR: '"OP B"' if the left
 ⍝⍝ argument name is empty, otherwise '"A OP B"' (with literal A and B)
 ⍝⍝
 (A OP B)←3↑EXPR
 Z←OP, ' B"' ◊ Z1←'"'
 →(0=⍴A)/1+↑⎕LC ◊ Z1←'"A '   ⍝ monadic: skip the rest of this line
 Z←Z1,Z
∇

∇Z←X LSQRL Y;N;XY;XX;Zb;Za;SX;SXX;SY;SXY
 ⍝⍝ return the least square regression line (a line a + b×N with minimal
 ⍝⍝ distance from samples Y(X)). The result is the 2-element vector a, b.
 N←⍴X
 XY←X×Y ◊ XX←X×X
 SX←+/X ◊ SY←+/Y ◊ SXY←+/XY ◊ SXX←+/XX
 Zb←( (N×SXY) - SX×SY ) ÷ ((N×SXX) - SX×SX)
 Za←(SY - Zb×SX) ÷ N
 Z←Za, Zb
∇

 ⍝ ----------------------------------------------------
 ⍝ Run one pass (one length), return the result length and the minimum
 ⍝ number of cycles measured
 ⍝
∇Z←ONE_PASS EXPR;OP;STAT;I;ZZ;TH1;TH2;CYCLES;T0;T1;Q
 ⍝⍝
 ⍝⍝ execute EXPR repeatedly (at most ILRC+1 times, and at least 3 times
 ⍝⍝ before TIME_LIMIT is honored) and collect item 4 of the statistics
 ⍝⍝ STAT after each execution.
 ⍝⍝
 ⍝⍝ Z[1]: number of items in the result of EXPR
 ⍝⍝ Z[2]: the smallest measurement, not counting the first two
 ⍝⍝
 OP←⊃EXPR[2]
 STAT←EXPR[6]

 ⍝ set both thresholds of OP to 1 for the measurement. The values returned
 ⍝ here are written back below (presumably the previous thresholds).
 ⍝
 TH1← 1 FIO∆set_monadic_threshold OP
 TH2← 1 FIO∆set_dyadic_threshold OP

 I←0
 ZZ←⍬
 T0←24 60 60 1000⊥¯4↑⎕TS   ⍝ milliseconds since midnight
L:
 FIO∆clear_statistics STAT
 Q←⍎TITLE EXPR
 CYCLES←(FIO∆get_statistics STAT)[4]
 ZZ←ZZ,CYCLES
 T1←24 60 60 1000⊥¯4↑⎕TS

 ⍝ don't let it run too long. T0 and T1 wrap around at midnight, therefore
 ⍝ the elapsed time is computed modulo the number of milliseconds per day.
 ⍝
 →((I≥2) ∧ TIME_LIMIT<86400000∣T1-T0)⍴DONE
 →(ILRC≥I←I+1)/L
DONE:

 ⍝ restore thresholds
 ⊣ TH1 FIO∆set_monadic_threshold OP
 ⊣ TH2 FIO∆set_dyadic_threshold OP

 ⍝ ignore the first 2 measurements as cache warm-up
 ⍝
 ZZ←2↓ZZ
⍝ Z←(⍴,Q), ⌊ Average[1]ZZ     (alternative: average instead of minimum)
 Z←(⍴,Q), ⌊ ⌊⌿ZZ
∇

 ⍝ ----------------------------------------------------
 ⍝ figure start-up times for sequential and parallel execution.
 ⍝ We use small vector sizes for better precision
 ⍝
∇Z←FIGURE_A EXPR;LENGTHS;LL;I;LEN;ZS;ZP;SA;SB;PA;PB;H1;H2;P;TXT
 ⍝⍝
 ⍝⍝ measure EXPR for the data lengths 20, 19, ... 1, once with ⎕SYL[26;2]
 ⍝⍝ set to 0 (sequential) and once with ⎕SYL[26;2] set to CORES (parallel),
 ⍝⍝ fit a regression line through each series, and print (and optionally
 ⍝⍝ plot) the results.
 ⍝⍝
 ⍝⍝ Z: the constant terms (start-up costs) of the sequential and of the
 ⍝⍝    parallel regression line
 ⍝⍝
 TXT←78↑' ===================== ', (TITLE EXPR), ' ', 80⍴ '='
 '' ◊ TXT ◊ ''
 LL←⍴LENGTHS←⌽⍳20   ⍝ outer loop vector lengths
 'Benchmarking start-up cost for ', (TITLE EXPR), ' ...'

 I←1 ◊ ZS←0 2⍴0
 ⎕SYL[26;2] ← 0       ⍝ sequential
LS: INIT_DATA LEN←LENGTHS[I]
 ZS←ZS⍪ONE_PASS EXPR
 →(LL≥I←I+1)⍴LS

 I←1 ◊ ZP←0 2⍴0
 ⎕SYL[26;2] ← CORES   ⍝ parallel
LP: INIT_DATA LEN←LENGTHS[I]
 ZP←ZP⍪ONE_PASS EXPR
 →(LL≥I←I+1)⍴LP

 (SA SB)←⌊ ZS[;1] LSQRL ZS[;2]
 (PA PB)←⌊ ZP[;1] LSQRL ZP[;2]

 ⍝ print and plot result
 ⍝
 P←ZS,ZP[;2]          ⍝ sequential and parallel cycles
 P←P,(SA+ZS[;1]×SB)   ⍝ sequential least square regression line
 P←P,(PA+ZP[;1]×PB)   ⍝ parallel least square regression line
 H1←'Length' ' Sequ Cycles' ' Para Cycles' ' Linear Sequ' 'Linear Para'
 H2←'======' ' ===========' ' ===========' ' ===========' '==========='
 H1⍪H2⍪P

 ''
 'regression line sequential: ', (¯8↑⍕SA), ' + ', (⍕SB),'×N cycles'
 'regression line parallel: ', (¯8↑⍕PA), ' + ', (⍕PB),'×N cycles'

 ⍝ xdomain of aplplot seems not to work for xy plots - create a dummy x=0 line
 P←(0, SA, PA, SA, PA)⍪P

 EXPR PLOT_P P
 Z←SA,PA
∇

 ⍝ ----------------------------------------------------
 ⍝ figure per-item times for sequential and parallel execution.
 ⍝ We use one LARGE vector
 ⍝
∇Z←SUP_A FIGURE_B EXPR;SOFF;POFF;SCYC;PCYC;LEN;TS;TP
 ⍝⍝
 ⍝⍝ measure EXPR once sequentially and once in parallel on data of length
 ⍝⍝ LEN_PI and compute the per-item costs.
 ⍝⍝
 ⍝⍝ SUP_A: sequential and parallel start-up cost (subtracted from the
 ⍝⍝        measured cycles before dividing by the result length)
 ⍝⍝ Z:     (TITLE EXPR), sequential per-item cost, parallel per-item cost
 ⍝⍝
 ⍝⍝ Appends to SUMMARY, which is a local variable of the caller (GO), and
 ⍝⍝ calls BREAK_EVEN, which writes to TH_FILE (also local to GO).
 ⍝⍝
 (SOFF POFF)←SUP_A
 'Benchmarking per-item cost for ', (TITLE EXPR), ' ...'
 SUMMARY←SUMMARY,⊂'-------------- ', (TITLE EXPR), ' -------------- '
 SUMMARY←SUMMARY,⊂'average sequential startup cost:', (¯8↑⍕⌈SOFF), ' cycles'
 SUMMARY←SUMMARY,⊂'average parallel startup cost: ', (¯8↑⍕⌈POFF), ' cycles'

 INIT_DATA LEN_PI
 ⎕SYL[26;2] ← 0       ⍝ sequential
 (LEN SCYC)←ONE_PASS EXPR
 ⎕SYL[26;2] ← CORES   ⍝ parallel
 (LEN PCYC)←ONE_PASS EXPR
 Z←⊂TITLE EXPR
 Z←Z, ⌈ (SCYC - SOFF) ÷ LEN
 Z←Z, ⌈ (PCYC - POFF) ÷ LEN
 TS←'per item cost sequential: ',(¯8↑⍕Z[2]), ' cycles'
 TP←'per item cost parallel: ',(¯8↑⍕Z[3]), ' cycles'
 SUMMARY←SUMMARY,(⊂TS),(⊂TP)

 SUP_A BREAK_EVEN (⊂EXPR),Z
∇

∇SUP BREAK_EVEN PERI;EXPR;OP;ICS;ICP;SUPS;SUPP;T1;T2;T3;BE;OUT
 ⍝⍝
 ⍝⍝ compute the vector length at which parallel execution breaks even with
 ⍝⍝ sequential execution, append it to SUMMARY, and write one perfo_N(...)
 ⍝⍝ line for the expression to TH_FILE.
 ⍝⍝
 ⍝⍝ SUP:  sequential and parallel start-up cost
 ⍝⍝ PERI: EXPR, its title, sequential and parallel per-item cost
 ⍝⍝
 ⍝⍝ SUMMARY and TH_FILE are local variables of GO.
 ⍝⍝
 (SUPS SUPP)←SUP             ⍝ start-up cost
 (EXPR OP ICS ICP)←PERI      ⍝ per-item cost
 T1←'parallel break-even length: '

 ⍝ defaults for the case that parallel execution is never faster
 ⍝
 T2←' not reached' ◊ T3←'8888888888888888888ULL'

 ⍝ parallel per-item cost not below sequential: keep the defaults (skip the
 ⍝ rest of this line)
 ⍝
 →(ICP ≥ ICS)⍴1+↑⎕LC ◊ T2←¯8↑BE←⍕⌈ (SUPP - SUPS) ÷ ICS - ICP ◊ T3←21↑BE
 SUMMARY←SUMMARY,(⊂T1,T2),⊂''

 OUT←'perfo_',(⍕EXPR[4])
 OUT←OUT, 16↑'(',(⊃EXPR[5]),','
 OUT←OUT, 6↑'_',((-1+0<⍴⊃EXPR[1])↑'AB'),','   ⍝ _B (monadic) or _AB (dyadic)
 OUT←OUT, 10↑(TITLE1 EXPR),','
 OUT←OUT, T3,')',⎕UCS ,10
 ⊣ OUT FIO∆fwrite_utf8 TH_FILE
∇

 ⍝ ----------------------------------------------------

∇GO;DYA_A;MON_A;SUMMARY;TH_FILE
 ⍝⍝
 ⍝⍝ main program: check that the core count can be set, figure the average
 ⍝⍝ start-up costs of all monadic and of all dyadic expressions, figure the
 ⍝⍝ per-item costs, write the file "parallel_thresholds", and print the
 ⍝⍝ summary.
 ⍝⍝
 CORES←CORES ⌊ ⎕SYL[25;2]
 'Running ScalarBenchmark_2 with' CORES 'cores...'

 ⍝ check that the core count can be set
 ⍝
 ⎕SYL[26;2] ← CORES
 →(CORES = ⎕SYL[26;2])⍴CORES_OK
 '*** CPU core count could not be set!'
 '*** This is usually a configuration or platform problem.'
 '***'
 '*** try "make parallel1" in the top-level directory'
 '***'
 '*** the relevant ./configure options (used by make parallel1) are:'
 '*** PERFORMANCE_COUNTERS_WANTED=yes'
 '*** CORE_COUNT_WANTED=SYL'
 '***'
 '*** NOTE: parallel GNU APL currently requires linux and a recent Intel CPU'
 '***'
 →0

CORES_OK:
 ⎕SYL[26;2] ← 0

 ⍝ figure start-up costs
 ⍝
 MON_A←Average[1] ⊃ FIGURE_A ¨ MON_EXPR
 DYA_A←Average[1] ⊃ FIGURE_A ¨ DYA_EXPR

 ⍝ figure per-item costs. we can do that only after computing MON_A/DYA_A
 ⍝
 ''
 SUMMARY←0⍴''
 TH_FILE←"w" FIO∆fopen "parallel_thresholds"

 ⊣ "\n" FIO∆fwrite_utf8 TH_FILE

 ⊣ (⊂MON_A) FIGURE_B ¨ MON_EXPR

 ⊣ "\n" FIO∆fwrite_utf8 TH_FILE

 ⊣ (⊂DYA_A) FIGURE_B ¨ DYA_EXPR

 ⊣ "\n" FIO∆fwrite_utf8 TH_FILE
 ⊣ "#undef perfo_1\n" FIO∆fwrite_utf8 TH_FILE
 ⊣ "#undef perfo_2\n" FIO∆fwrite_utf8 TH_FILE
 ⊣ "#undef perfo_3\n" FIO∆fwrite_utf8 TH_FILE
 ⊣ "\n" FIO∆fwrite_utf8 TH_FILE

 ⊣ FIO∆fclose TH_FILE

 ''
 78↑' ============================ SUMMARY ',80⍴'='
 ''
 ⊣ { ⎕←⍵ }¨SUMMARY
∇


 GO

 ]PSTAT
 )OFF
