1 /* 2 * Copyright 2019 Google LLC 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "include/core/SkStream.h" 9 #include "include/core/SkString.h" 10 #include "include/private/SkChecksum.h" 11 #include "include/private/SkSpinlock.h" 12 #include "include/private/SkTFitsIn.h" 13 #include "include/private/SkThreadID.h" 14 #include "include/private/SkVx.h" 15 #include "src/core/SkColorSpaceXformSteps.h" 16 #include "src/core/SkCpu.h" 17 #include "src/core/SkOpts.h" 18 #include "src/core/SkVM.h" 19 #include <algorithm> 20 #include <atomic> 21 #include <queue> 22 23 #if defined(SKVM_LLVM) 24 #include <future> 25 #include <llvm/Bitcode/BitcodeWriter.h> 26 #include <llvm/ExecutionEngine/ExecutionEngine.h> 27 #include <llvm/IR/IRBuilder.h> 28 #include <llvm/IR/Verifier.h> 29 #include <llvm/Support/TargetSelect.h> 30 #endif 31 32 bool gSkVMJITViaDylib{false}; 33 34 // JIT code isn't MSAN-instrumented, so we won't see when it uses 35 // uninitialized memory, and we'll not see the writes it makes as properly 36 // initializing memory. Instead force the interpreter, which should let 37 // MSAN see everything our programs do properly. 38 // 39 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter. 40 #if defined(__has_feature) 41 #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer) 42 #undef SKVM_JIT 43 #endif 44 #endif 45 46 #if defined(SKVM_JIT) 47 #include <dlfcn.h> // dlopen, dlsym 48 #include <sys/mman.h> // mmap, mprotect 49 #endif 50 51 namespace skvm { 52 53 struct Program::Impl { 54 std::vector<InterpreterInstruction> instructions; 55 int regs = 0; 56 int loop = 0; 57 std::vector<int> strides; 58 59 std::atomic<void*> jit_entry{nullptr}; // TODO: minimal std::memory_orders 60 size_t jit_size = 0; 61 void* dylib = nullptr; 62 63 #if defined(SKVM_LLVM) 64 std::unique_ptr<llvm::LLVMContext> llvm_ctx; 65 std::unique_ptr<llvm::ExecutionEngine> llvm_ee; 66 std::future<void> llvm_compiling; 67 #endif 68 }; 69 70 // Debugging tools, mostly for printing various data structures out to a stream. 71 72 namespace { 73 class SkDebugfStream final : public SkWStream { 74 size_t fBytesWritten = 0; 75 write(const void * buffer,size_t size)76 bool write(const void* buffer, size_t size) override { 77 SkDebugf("%.*s", size, buffer); 78 fBytesWritten += size; 79 return true; 80 } 81 bytesWritten() const82 size_t bytesWritten() const override { 83 return fBytesWritten; 84 } 85 }; 86 87 struct V { Val id; }; 88 struct R { Reg id; }; 89 struct Shift { int bits; }; 90 struct Splat { int bits; }; 91 struct Hex { int bits; }; 92 write(SkWStream * o,const char * s)93 static void write(SkWStream* o, const char* s) { 94 o->writeText(s); 95 } 96 name(Op op)97 static const char* name(Op op) { 98 switch (op) { 99 #define M(x) case Op::x: return #x; 100 SKVM_OPS(M) 101 #undef M 102 } 103 return "unknown op"; 104 } 105 write(SkWStream * o,Op op)106 static void write(SkWStream* o, Op op) { 107 const char* raw = name(op); 108 if (const char* found = strstr(raw, "_imm")) { 109 o->write(raw, found-raw); 110 } else { 111 o->writeText(raw); 112 } 113 } write(SkWStream * o,Arg a)114 static void write(SkWStream* o, Arg a) { 115 write(o, "arg("); 116 o->writeDecAsText(a.ix); 117 write(o, ")"); 118 } write(SkWStream * o,V v)119 static void write(SkWStream* o, V v) { 120 write(o, "v"); 121 o->writeDecAsText(v.id); 122 } write(SkWStream * o,R r)123 static void write(SkWStream* o, R r) { 124 write(o, "r"); 125 o->writeDecAsText(r.id); 126 } write(SkWStream * o,Shift s)127 static void write(SkWStream* o, Shift s) { 128 o->writeDecAsText(s.bits); 129 } write(SkWStream * o,Splat s)130 static void write(SkWStream* o, Splat s) { 131 float f; 132 memcpy(&f, &s.bits, 4); 133 o->writeHexAsText(s.bits); 134 write(o, " ("); 135 o->writeScalarAsText(f); 136 write(o, ")"); 137 } write(SkWStream * o,Hex h)138 static void write(SkWStream* o, Hex h) { 139 o->writeHexAsText(h.bits); 140 } 141 142 template <typename T, typename... Ts> write(SkWStream * o,T first,Ts...rest)143 static void write(SkWStream* o, T first, Ts... rest) { 144 write(o, first); 145 write(o, " "); 146 write(o, rest...); 147 } 148 } 149 dot(SkWStream * o,bool for_jit) const150 void Builder::dot(SkWStream* o, bool for_jit) const { 151 SkDebugfStream debug; 152 if (!o) { o = &debug; } 153 154 std::vector<OptimizedInstruction> optimized = this->optimize(for_jit); 155 156 o->writeText("digraph {\n"); 157 for (Val id = 0; id < (Val)optimized.size(); id++) { 158 const OptimizedInstruction& i = optimized[id]; 159 160 switch (i.op) { 161 default: 162 write(o, "\t", V{id}, " [label = \"", V{id}, i.op); 163 // Not a perfect heuristic; sometimes y/z == NA and there is no immy/z. 164 // On the other hand, sometimes immy/z=0 is meaningful and should be printed. 165 if (i.y == NA) { write(o, "", Hex{i.immy}); } 166 if (i.z == NA) { write(o, "", Hex{i.immz}); } 167 write(o, "\"]\n"); 168 169 write(o, "\t", V{id}, " -> {"); 170 // In contrast to the heuristic imm labels, these dependences are exact. 171 if (i.x != NA) { write(o, "", V{i.x}); } 172 if (i.y != NA) { write(o, "", V{i.y}); } 173 if (i.z != NA) { write(o, "", V{i.z}); } 174 write(o, " }\n"); 175 176 break; 177 178 // That default: impl works pretty well for most instructions, 179 // but some are nicer to see with a specialized label. 180 181 case Op::splat: 182 write(o, "\t", V{id}, " [label = \"", V{id}, i.op, Splat{i.immy}, "\"]\n"); 183 break; 184 } 185 } 186 o->writeText("}\n"); 187 } 188 dump(SkWStream * o) const189 void Builder::dump(SkWStream* o) const { 190 SkDebugfStream debug; 191 if (!o) { o = &debug; } 192 193 std::vector<OptimizedInstruction> optimized = this->optimize(); 194 o->writeDecAsText(optimized.size()); 195 o->writeText(" values (originally "); 196 o->writeDecAsText(fProgram.size()); 197 o->writeText("):\n"); 198 for (Val id = 0; id < (Val)optimized.size(); id++) { 199 const OptimizedInstruction& inst = optimized[id]; 200 Op op = inst.op; 201 Val x = inst.x, 202 y = inst.y, 203 z = inst.z; 204 int immy = inst.immy, 205 immz = inst.immz; 206 write(o, !inst.can_hoist ? " " : 207 inst.used_in_loop ? "↑ " : 208 "↟ "); 209 switch (op) { 210 case Op::assert_true: write(o, op, V{x}, V{y}); break; 211 212 case Op::store8: write(o, op, Arg{immy}, V{x}); break; 213 case Op::store16: write(o, op, Arg{immy}, V{x}); break; 214 case Op::store32: write(o, op, Arg{immy}, V{x}); break; 215 216 case Op::index: write(o, V{id}, "=", op); break; 217 218 case Op::load8: write(o, V{id}, "=", op, Arg{immy}); break; 219 case Op::load16: write(o, V{id}, "=", op, Arg{immy}); break; 220 case Op::load32: write(o, V{id}, "=", op, Arg{immy}); break; 221 222 case Op::gather8: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break; 223 case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break; 224 case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break; 225 226 case Op::uniform8: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break; 227 case Op::uniform16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break; 228 case Op::uniform32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break; 229 230 case Op::splat: write(o, V{id}, "=", op, Splat{immy}); break; 231 232 233 case Op::add_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 234 case Op::sub_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 235 case Op::mul_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 236 case Op::div_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 237 case Op::min_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 238 case Op::max_f32: write(o, V{id}, "=", op, V{x}, V{y} ); break; 239 case Op::fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 240 case Op::fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 241 case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 242 243 244 case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break; 245 246 case Op::add_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break; 247 case Op::sub_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break; 248 case Op::mul_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break; 249 case Op::min_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break; 250 case Op::max_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break; 251 252 case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 253 case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 254 case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 255 case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break; 256 257 258 case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 259 case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 260 case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 261 262 case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break; 263 case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break; 264 case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break; 265 266 case Op:: eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 267 case Op::neq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 268 case Op:: gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 269 case Op::gte_i32: write(o, V{id}, "=", op, V{x}, V{y}); break; 270 271 case Op::add_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 272 case Op::sub_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 273 case Op::mul_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 274 275 case Op::shl_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break; 276 case Op::shr_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break; 277 case Op::sra_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break; 278 279 case Op:: eq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 280 case Op::neq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 281 case Op:: gt_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 282 case Op::gte_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break; 283 284 case Op::bit_and : write(o, V{id}, "=", op, V{x}, V{y} ); break; 285 case Op::bit_or : write(o, V{id}, "=", op, V{x}, V{y} ); break; 286 case Op::bit_xor : write(o, V{id}, "=", op, V{x}, V{y} ); break; 287 case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y} ); break; 288 289 case Op::bit_and_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break; 290 case Op::bit_or_imm : write(o, V{id}, "=", op, V{x}, Hex{immy}); break; 291 case Op::bit_xor_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break; 292 293 case Op::select: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break; 294 case Op::bytes: write(o, V{id}, "=", op, V{x}, Hex{immy}); break; 295 case Op::pack: write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}); break; 296 297 case Op::floor: write(o, V{id}, "=", op, V{x}); break; 298 case Op::to_f32: write(o, V{id}, "=", op, V{x}); break; 299 case Op::trunc: write(o, V{id}, "=", op, V{x}); break; 300 case Op::round: write(o, V{id}, "=", op, V{x}); break; 301 } 302 303 write(o, "\n"); 304 } 305 } 306 dump(SkWStream * o) const307 void Program::dump(SkWStream* o) const { 308 SkDebugfStream debug; 309 if (!o) { o = &debug; } 310 311 o->writeDecAsText(fImpl->regs); 312 o->writeText(" registers, "); 313 o->writeDecAsText(fImpl->instructions.size()); 314 o->writeText(" instructions:\n"); 315 for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) { 316 if (i == fImpl->loop) { write(o, "loop:\n"); } 317 o->writeDecAsText(i); 318 o->writeText("\t"); 319 if (i >= fImpl->loop) { write(o, " "); } 320 const InterpreterInstruction& inst = fImpl->instructions[i]; 321 Op op = inst.op; 322 Reg d = inst.d, 323 x = inst.x, 324 y = inst.y, 325 z = inst.z; 326 int immy = inst.immy, 327 immz = inst.immz; 328 switch (op) { 329 case Op::assert_true: write(o, op, R{x}, R{y}); break; 330 331 case Op::store8: write(o, op, Arg{immy}, R{x}); break; 332 case Op::store16: write(o, op, Arg{immy}, R{x}); break; 333 case Op::store32: write(o, op, Arg{immy}, R{x}); break; 334 335 case Op::index: write(o, R{d}, "=", op); break; 336 337 case Op::load8: write(o, R{d}, "=", op, Arg{immy}); break; 338 case Op::load16: write(o, R{d}, "=", op, Arg{immy}); break; 339 case Op::load32: write(o, R{d}, "=", op, Arg{immy}); break; 340 341 case Op::gather8: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break; 342 case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break; 343 case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break; 344 345 case Op::uniform8: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break; 346 case Op::uniform16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break; 347 case Op::uniform32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break; 348 349 case Op::splat: write(o, R{d}, "=", op, Splat{immy}); break; 350 351 352 case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 353 case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 354 case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 355 case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 356 case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 357 case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y} ); break; 358 case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 359 case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 360 case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 361 362 case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break; 363 364 case Op::add_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break; 365 case Op::sub_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break; 366 case Op::mul_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break; 367 case Op::min_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break; 368 case Op::max_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break; 369 370 case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 371 case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 372 case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 373 case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break; 374 375 376 case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 377 case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 378 case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 379 380 case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break; 381 case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break; 382 case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break; 383 384 case Op:: eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 385 case Op::neq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 386 case Op:: gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 387 case Op::gte_i32: write(o, R{d}, "=", op, R{x}, R{y}); break; 388 389 390 case Op::add_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 391 case Op::sub_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 392 case Op::mul_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 393 394 case Op::shl_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break; 395 case Op::shr_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break; 396 case Op::sra_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break; 397 398 case Op:: eq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 399 case Op::neq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 400 case Op:: gt_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 401 case Op::gte_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break; 402 403 404 case Op::bit_and : write(o, R{d}, "=", op, R{x}, R{y} ); break; 405 case Op::bit_or : write(o, R{d}, "=", op, R{x}, R{y} ); break; 406 case Op::bit_xor : write(o, R{d}, "=", op, R{x}, R{y} ); break; 407 case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y} ); break; 408 409 case Op::bit_and_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break; 410 case Op::bit_or_imm : write(o, R{d}, "=", op, R{x}, Hex{immy}); break; 411 case Op::bit_xor_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break; 412 413 case Op::select: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break; 414 case Op::bytes: write(o, R{d}, "=", op, R{x}, Hex{immy}); break; 415 case Op::pack: write(o, R{d}, "=", op, R{x}, R{y}, Shift{immz}); break; 416 417 case Op::floor: write(o, R{d}, "=", op, R{x}); break; 418 case Op::to_f32: write(o, R{d}, "=", op, R{x}); break; 419 case Op::trunc: write(o, R{d}, "=", op, R{x}); break; 420 case Op::round: write(o, R{d}, "=", op, R{x}); break; 421 } 422 write(o, "\n"); 423 } 424 } 425 specialize_for_jit(std::vector<Instruction> * program)426 void specialize_for_jit(std::vector<Instruction>* program) { 427 Builder specialized; 428 for (Val i = 0; i < (Val)program->size(); i++) { 429 Instruction inst = (*program)[i]; 430 431 #if defined(SK_CPU_X86) 432 auto is_imm = [&](Val id, int* bits) { 433 *bits = (*program)[id].immy; 434 return (*program)[id].op == Op::splat; 435 }; 436 437 Op imm_op; 438 int bits; 439 switch (inst.op) { 440 default: break; 441 442 case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y; 443 case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y; 444 case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x_and_y; 445 case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x_and_y; 446 case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y; 447 case Op::bit_or: imm_op = Op::bit_or_imm ; goto try_imm_x_and_y; 448 case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y; 449 450 try_imm_x_and_y: 451 if (is_imm(inst.x, &bits)) { 452 inst.op = imm_op; 453 inst.x = inst.y; 454 inst.y = NA; 455 inst.immy = bits; 456 } else if (is_imm(inst.y, &bits)) { 457 inst.op = imm_op; 458 inst.y = NA; 459 inst.immy = bits; 460 } break; 461 462 case Op::sub_f32: 463 if (is_imm(inst.y, &bits)) { 464 inst.op = Op::sub_f32_imm; 465 inst.y = NA; 466 inst.immy = bits; 467 } break; 468 469 case Op::bit_clear: 470 if (is_imm(inst.y, &bits)) { 471 inst.op = Op::bit_and_imm; 472 inst.y = NA; 473 inst.immy = ~bits; 474 } break; 475 } 476 #endif 477 SkDEBUGCODE(Val id =) specialized.push(inst); 478 // If we replace single instructions with multiple, this will start breaking, 479 // and we'll need a table to remap them like we have in optimize(). 480 SkASSERT(id == i); 481 } 482 483 *program = specialized.program(); 484 } 485 optimize(bool for_jit) const486 std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const { 487 std::vector<Instruction> program = this->program(); 488 if (for_jit) { 489 specialize_for_jit(&program); 490 } 491 492 std::vector<bool> live_instructions; 493 std::vector<Val> frontier; 494 int liveInstructionCount = liveness_analysis(program, &live_instructions, &frontier); 495 skvm::Usage usage{program, live_instructions}; 496 497 std::vector<int> remaining_uses; 498 for (Val id = 0; id < (Val)program.size(); id++) { 499 remaining_uses.push_back((int)usage.users(id).size()); 500 } 501 502 // Map old Val index to rewritten index in optimized. 503 std::vector<Val> new_index(program.size(), NA); 504 505 auto pressure_change = [&](Val id) -> int { 506 int pressure = 0; 507 Instruction inst = program[id]; 508 509 // If this is not a sink, then it takes up a register 510 if (inst.op > Op::store32) { pressure += 1; } 511 512 // If this is the last use of the value, then that register will be free. 513 if (inst.x != NA && remaining_uses[inst.x] == 1) { pressure -= 1; } 514 if (inst.y != NA && remaining_uses[inst.y] == 1) { pressure -= 1; } 515 if (inst.z != NA && remaining_uses[inst.z] == 1) { pressure -= 1; } 516 return pressure; 517 }; 518 519 auto compare = [&](Val lhs, Val rhs) { 520 SkASSERT(lhs != rhs); 521 int lhs_change = pressure_change(lhs); 522 int rhs_change = pressure_change(rhs); 523 524 // This comparison operator orders instructions from least (likely negative) register 525 // pressure to most register pressure, breaking ties arbitrarily using original 526 // program order comparing the instruction index itself. 527 // 528 // We'll use this operator with std::{make,push,pop}_heap() to maintain a max heap 529 // frontier of instructions that are ready to schedule. We iterate backwards through 530 // the program, scheduling later instruction slots before earlier ones, and that means 531 // an instruction becomes ready to schedule once all instructions using its result have 532 // been scheduled (in later slots). 533 // 534 // All together that means we'll be issuing the instructions that hurt register pressure 535 // as late as possible, and issuing the instructions that help register pressure as soon 536 // as possible. 537 // 538 // This heuristic of greedily issuing the instruction that most immediately decreases 539 // register pressure approximates a more expensive search to find a schedule that 540 // minimizes the high-water maximum register pressure, the number of registers we'll 541 // need to run this program. 542 // 543 // The tie-breaker heuristic was found through experimentation. 544 return lhs_change < rhs_change || (lhs_change == rhs_change && lhs > rhs); 545 }; 546 547 // Order the instructions. 548 std::make_heap(frontier.begin(), frontier.end(), compare); 549 550 // Schedule the instructions last to first from the DAG. Produce a schedule that executes 551 // instructions that reduce register pressure before ones that increase register 552 // pressure. 553 std::vector<OptimizedInstruction> optimized; 554 optimized.resize(liveInstructionCount); 555 for (int i = liveInstructionCount; i-- > 0;) { 556 SkASSERT(!frontier.empty()); 557 std::pop_heap(frontier.begin(), frontier.end(), compare); 558 Val id = frontier.back(); 559 frontier.pop_back(); 560 new_index[id] = i; 561 Instruction inst = program[id]; 562 SkASSERT(remaining_uses[id] == 0); 563 564 // Use the old indices, and fix them up later. 565 optimized[i] = {inst.op, 566 inst.x, inst.y, inst.z, 567 inst.immy, inst.immz, 568 /*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false}; 569 570 auto maybe_issue = [&](Val input) { 571 if (input != NA) { 572 if (remaining_uses[input] == 1) { 573 frontier.push_back(input); 574 std::push_heap(frontier.begin(), frontier.end(), compare); 575 } 576 remaining_uses[input]--; 577 } 578 }; 579 maybe_issue(inst.x); 580 maybe_issue(inst.y); 581 maybe_issue(inst.z); 582 } 583 584 // Fix up the optimized program to use the optimized indices. 585 for (Val id = 0; id < (Val)optimized.size(); id++) { 586 OptimizedInstruction& inst = optimized[id]; 587 if (inst.x != NA ) { inst.x = new_index[inst.x]; } 588 if (inst.y != NA ) { inst.y = new_index[inst.y]; } 589 if (inst.z != NA ) { inst.z = new_index[inst.z]; } 590 } 591 592 SkASSERT(frontier.empty()); 593 594 // We're done with `program` now... everything below will analyze `optimized`. 595 596 // We'll want to know when it's safe to recycle registers holding the values 597 // produced by each instruction, that is, when no future instruction needs it. 598 for (Val id = 0; id < (Val)optimized.size(); id++) { 599 OptimizedInstruction& inst = optimized[id]; 600 // Stores don't really produce values. Just mark them as dying on issue. 601 if (inst.op <= Op::store32) { 602 inst.death = id; 603 } 604 // Extend the lifetime of this instruction's inputs to live until it issues. 605 // (We're walking in order, so this is the same as max()ing.) 606 if (inst.x != NA) { optimized[inst.x].death = id; } 607 if (inst.y != NA) { optimized[inst.y].death = id; } 608 if (inst.z != NA) { optimized[inst.z].death = id; } 609 } 610 611 // Mark which values don't depend on the loop and can be hoisted. 612 for (Val id = 0; id < (Val)optimized.size(); id++) { 613 OptimizedInstruction& inst = optimized[id]; 614 615 // Varying loads (and gathers) and stores cannot be hoisted out of the loop. 616 if (inst.op <= Op::gather32 && inst.op != Op::assert_true) { 617 inst.can_hoist = false; 618 } 619 620 // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself. 621 if (inst.can_hoist) { 622 if (inst.x != NA) { inst.can_hoist &= optimized[inst.x].can_hoist; } 623 if (inst.y != NA) { inst.can_hoist &= optimized[inst.y].can_hoist; } 624 if (inst.z != NA) { inst.can_hoist &= optimized[inst.z].can_hoist; } 625 } 626 627 // We'll want to know if hoisted values are used in the loop; 628 // if not, we can recycle their registers like we do loop values. 629 if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) { 630 if (inst.x != NA) { optimized[inst.x].used_in_loop = true; } 631 if (inst.y != NA) { optimized[inst.y].used_in_loop = true; } 632 if (inst.z != NA) { optimized[inst.z].used_in_loop = true; } 633 } 634 } 635 636 return optimized; 637 } 638 done(const char * debug_name) const639 Program Builder::done(const char* debug_name) const { 640 char buf[64] = "skvm-jit-"; 641 if (!debug_name) { 642 *SkStrAppendU32(buf+9, this->hash()) = '\0'; 643 debug_name = buf; 644 } 645 646 #if defined(SKVM_LLVM) || defined(SKVM_JIT) 647 return {this->optimize(false), this->optimize(true), fStrides, debug_name}; 648 #else 649 return {this->optimize(false), fStrides}; 650 #endif 651 } 652 hash() const653 uint64_t Builder::hash() const { 654 uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0), 655 hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1); 656 return (uint64_t)lo | (uint64_t)hi << 32; 657 } 658 operator ==(const Instruction & a,const Instruction & b)659 bool operator==(const Instruction& a, const Instruction& b) { 660 return a.op == b.op 661 && a.x == b.x 662 && a.y == b.y 663 && a.z == b.z 664 && a.immy == b.immy 665 && a.immz == b.immz; 666 } 667 operator ()(const Instruction & inst,uint32_t seed) const668 uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const { 669 return SkOpts::hash(&inst, sizeof(inst), seed); 670 } 671 672 673 // Most instructions produce a value and return it by ID, 674 // the value-producing instruction's own index in the program vector. push(Instruction inst)675 Val Builder::push(Instruction inst) { 676 // Basic common subexpression elimination: 677 // if we've already seen this exact Instruction, use it instead of creating a new one. 678 if (Val* id = fIndex.find(inst)) { 679 return *id; 680 } 681 Val id = static_cast<Val>(fProgram.size()); 682 fProgram.push_back(inst); 683 fIndex.set(inst, id); 684 return id; 685 } 686 allImm() const687 bool Builder::allImm() const { return true; } 688 689 template <typename T, typename... Rest> allImm(Val id,T * imm,Rest...rest) const690 bool Builder::allImm(Val id, T* imm, Rest... rest) const { 691 if (fProgram[id].op == Op::splat) { 692 static_assert(sizeof(T) == 4, ""); 693 memcpy(imm, &fProgram[id].immy, 4); 694 return this->allImm(rest...); 695 } 696 return false; 697 } 698 arg(int stride)699 Arg Builder::arg(int stride) { 700 int ix = (int)fStrides.size(); 701 fStrides.push_back(stride); 702 return {ix}; 703 } 704 assert_true(I32 cond,I32 debug)705 void Builder::assert_true(I32 cond, I32 debug) { 706 #ifdef SK_DEBUG 707 int imm; 708 if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; } 709 (void)push(Op::assert_true, cond.id,debug.id,NA); 710 #endif 711 } 712 store8(Arg ptr,I32 val)713 void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); } store16(Arg ptr,I32 val)714 void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); } store32(Arg ptr,I32 val)715 void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); } 716 index()717 I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,0) }; } 718 load8(Arg ptr)719 I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; } load16(Arg ptr)720 I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; } load32(Arg ptr)721 I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; } 722 gather8(Arg ptr,int offset,I32 index)723 I32 Builder::gather8 (Arg ptr, int offset, I32 index) { 724 return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)}; 725 } gather16(Arg ptr,int offset,I32 index)726 I32 Builder::gather16(Arg ptr, int offset, I32 index) { 727 return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)}; 728 } gather32(Arg ptr,int offset,I32 index)729 I32 Builder::gather32(Arg ptr, int offset, I32 index) { 730 return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)}; 731 } 732 uniform8(Arg ptr,int offset)733 I32 Builder::uniform8(Arg ptr, int offset) { 734 return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)}; 735 } uniform16(Arg ptr,int offset)736 I32 Builder::uniform16(Arg ptr, int offset) { 737 return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)}; 738 } uniform32(Arg ptr,int offset)739 I32 Builder::uniform32(Arg ptr, int offset) { 740 return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)}; 741 } 742 743 // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern. splat(int n)744 I32 Builder::splat(int n) { return {this, push(Op::splat, NA,NA,NA, n) }; } splat(float f)745 F32 Builder::splat(float f) { 746 int bits; 747 memcpy(&bits, &f, 4); 748 return {this, push(Op::splat, NA,NA,NA, bits)}; 749 } 750 fma_supported()751 static bool fma_supported() { 752 static const bool supported = 753 #if defined(SK_CPU_X86) 754 SkCpu::Supports(SkCpu::HSW); 755 #elif defined(SK_CPU_ARM64) 756 true; 757 #else 758 false; 759 #endif 760 return supported; 761 } 762 763 // Be careful peepholing float math! Transformations you might expect to 764 // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0. 765 // Float peepholes must pass this equivalence test for all ~4B floats: 766 // 767 // bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); } 768 // 769 // unsigned bits = 0; 770 // do { 771 // float f; 772 // memcpy(&f, &bits, 4); 773 // if (!equiv(f, ...)) { 774 // abort(); 775 // } 776 // } while (++bits != 0); 777 add(F32 x,F32 y)778 F32 Builder::add(F32 x, F32 y) { 779 float X,Y; 780 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X+Y); } 781 if (this->isImm(y.id, 0.0f)) { return x; } // x+0 == x 782 if (this->isImm(x.id, 0.0f)) { return y; } // 0+y == y 783 784 if (fma_supported()) { 785 if (fProgram[x.id].op == Op::mul_f32) { 786 return {this, push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; 787 } 788 if (fProgram[y.id].op == Op::mul_f32) { 789 return {this, push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; 790 } 791 } 792 return {this, push(Op::add_f32, x.id, y.id)}; 793 } 794 sub(F32 x,F32 y)795 F32 Builder::sub(F32 x, F32 y) { 796 float X,Y; 797 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X-Y); } 798 if (this->isImm(y.id, 0.0f)) { return x; } // x-0 == x 799 if (fma_supported()) { 800 if (fProgram[x.id].op == Op::mul_f32) { 801 return {this, push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)}; 802 } 803 if (fProgram[y.id].op == Op::mul_f32) { 804 return {this, push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)}; 805 } 806 } 807 return {this, push(Op::sub_f32, x.id, y.id)}; 808 } 809 mul(F32 x,F32 y)810 F32 Builder::mul(F32 x, F32 y) { 811 float X,Y; 812 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X*Y); } 813 if (this->isImm(y.id, 1.0f)) { return x; } // x*1 == x 814 if (this->isImm(x.id, 1.0f)) { return y; } // 1*y == y 815 return {this, push(Op::mul_f32, x.id, y.id)}; 816 } 817 div(F32 x,F32 y)818 F32 Builder::div(F32 x, F32 y) { 819 float X,Y; 820 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X/Y); } 821 if (this->isImm(y.id, 1.0f)) { return x; } // x/1 == x 822 return {this, push(Op::div_f32, x.id, y.id)}; 823 } 824 sqrt(F32 x)825 F32 Builder::sqrt(F32 x) { 826 float X; 827 if (this->allImm(x.id,&X)) { return this->splat(std::sqrt(X)); } 828 return {this, push(Op::sqrt_f32, x.id,NA,NA)}; 829 } 830 831 // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html. approx_log2(F32 x)832 F32 Builder::approx_log2(F32 x) { 833 // e - 127 is a fair approximation of log2(x) in its own right... 834 F32 e = mul(to_f32(bit_cast(x)), splat(1.0f / (1<<23))); 835 836 // ... but using the mantissa to refine its error is _much_ better. 837 F32 m = bit_cast(bit_or(bit_and(bit_cast(x), 0x007fffff), 838 0x3f000000)); 839 F32 approx = sub(e, 124.225514990f); 840 approx = sub(approx, mul(1.498030302f, m)); 841 approx = sub(approx, div(1.725879990f, add(0.3520887068f, m))); 842 843 return approx; 844 } 845 approx_pow2(F32 x)846 F32 Builder::approx_pow2(F32 x) { 847 F32 f = fract(x); 848 F32 approx = add(x, 121.274057500f); 849 approx = sub(approx, mul( 1.490129070f, f)); 850 approx = add(approx, div(27.728023300f, sub(4.84252568f, f))); 851 852 return bit_cast(round(mul(1.0f * (1<<23), approx))); 853 } 854 approx_powf(F32 x,F32 y)855 F32 Builder::approx_powf(F32 x, F32 y) { 856 auto is_x = bit_or(eq(x, 0.0f), 857 eq(x, 1.0f)); 858 return select(is_x, x, approx_pow2(mul(approx_log2(x), y))); 859 } 860 min(F32 x,F32 y)861 F32 Builder::min(F32 x, F32 y) { 862 float X,Y; 863 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(std::min(X,Y)); } 864 return {this, push(Op::min_f32, x.id, y.id)}; 865 } max(F32 x,F32 y)866 F32 Builder::max(F32 x, F32 y) { 867 float X,Y; 868 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(std::max(X,Y)); } 869 return {this, push(Op::max_f32, x.id, y.id)}; 870 } 871 add(I32 x,I32 y)872 I32 Builder::add(I32 x, I32 y) { return {this, push(Op::add_i32, x.id, y.id)}; } sub(I32 x,I32 y)873 I32 Builder::sub(I32 x, I32 y) { return {this, push(Op::sub_i32, x.id, y.id)}; } mul(I32 x,I32 y)874 I32 Builder::mul(I32 x, I32 y) { return {this, push(Op::mul_i32, x.id, y.id)}; } 875 add_16x2(I32 x,I32 y)876 I32 Builder::add_16x2(I32 x, I32 y) { return {this, push(Op::add_i16x2, x.id, y.id)}; } sub_16x2(I32 x,I32 y)877 I32 Builder::sub_16x2(I32 x, I32 y) { return {this, push(Op::sub_i16x2, x.id, y.id)}; } mul_16x2(I32 x,I32 y)878 I32 Builder::mul_16x2(I32 x, I32 y) { return {this, push(Op::mul_i16x2, x.id, y.id)}; } 879 shl(I32 x,int bits)880 I32 Builder::shl(I32 x, int bits) { 881 if (bits == 0) { return x; } 882 int X; 883 if (this->allImm(x.id,&X)) { return this->splat(X << bits); } 884 return {this, push(Op::shl_i32, x.id,NA,NA, bits)}; 885 } shr(I32 x,int bits)886 I32 Builder::shr(I32 x, int bits) { 887 if (bits == 0) { return x; } 888 int X; 889 if (this->allImm(x.id,&X)) { return this->splat(unsigned(X) >> bits); } 890 return {this, push(Op::shr_i32, x.id,NA,NA, bits)}; 891 } sra(I32 x,int bits)892 I32 Builder::sra(I32 x, int bits) { 893 if (bits == 0) { return x; } 894 int X; 895 if (this->allImm(x.id,&X)) { return this->splat(X >> bits); } 896 return {this, push(Op::sra_i32, x.id,NA,NA, bits)}; 897 } 898 shl_16x2(I32 x,int k)899 I32 Builder::shl_16x2(I32 x, int k) { return {this, push(Op::shl_i16x2, x.id,NA,NA, k)}; } shr_16x2(I32 x,int k)900 I32 Builder::shr_16x2(I32 x, int k) { return {this, push(Op::shr_i16x2, x.id,NA,NA, k)}; } sra_16x2(I32 x,int k)901 I32 Builder::sra_16x2(I32 x, int k) { return {this, push(Op::sra_i16x2, x.id,NA,NA, k)}; } 902 eq(F32 x,F32 y)903 I32 Builder:: eq(F32 x, F32 y) { 904 float X,Y; 905 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X==Y ? ~0 : 0); } 906 return {this, push(Op::eq_f32, x.id, y.id)}; 907 } neq(F32 x,F32 y)908 I32 Builder::neq(F32 x, F32 y) { 909 float X,Y; 910 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X!=Y ? ~0 : 0); } 911 return {this, push(Op::neq_f32, x.id, y.id)}; 912 } lt(F32 x,F32 y)913 I32 Builder::lt(F32 x, F32 y) { 914 float X,Y; 915 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(Y> X ? ~0 : 0); } 916 return {this, push(Op::gt_f32, y.id, x.id)}; 917 } lte(F32 x,F32 y)918 I32 Builder::lte(F32 x, F32 y) { 919 float X,Y; 920 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(Y>=X ? ~0 : 0); } 921 return {this, push(Op::gte_f32, y.id, x.id)}; 922 } gt(F32 x,F32 y)923 I32 Builder::gt(F32 x, F32 y) { 924 float X,Y; 925 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X> Y ? ~0 : 0); } 926 return {this, push(Op::gt_f32, x.id, y.id)}; 927 } gte(F32 x,F32 y)928 I32 Builder::gte(F32 x, F32 y) { 929 float X,Y; 930 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X>=Y ? ~0 : 0); } 931 return {this, push(Op::gte_f32, x.id, y.id)}; 932 } 933 eq(I32 x,I32 y)934 I32 Builder:: eq(I32 x, I32 y) { return {this, push(Op:: eq_i32, x.id, y.id)}; } neq(I32 x,I32 y)935 I32 Builder::neq(I32 x, I32 y) { return {this, push(Op::neq_i32, x.id, y.id)}; } lt(I32 x,I32 y)936 I32 Builder:: lt(I32 x, I32 y) { return {this, push(Op:: gt_i32, y.id, x.id)}; } lte(I32 x,I32 y)937 I32 Builder::lte(I32 x, I32 y) { return {this, push(Op::gte_i32, y.id, x.id)}; } gt(I32 x,I32 y)938 I32 Builder:: gt(I32 x, I32 y) { return {this, push(Op:: gt_i32, x.id, y.id)}; } gte(I32 x,I32 y)939 I32 Builder::gte(I32 x, I32 y) { return {this, push(Op::gte_i32, x.id, y.id)}; } 940 eq_16x2(I32 x,I32 y)941 I32 Builder:: eq_16x2(I32 x, I32 y) { return {this, push(Op:: eq_i16x2, x.id, y.id)}; } neq_16x2(I32 x,I32 y)942 I32 Builder::neq_16x2(I32 x, I32 y) { return {this, push(Op::neq_i16x2, x.id, y.id)}; } lt_16x2(I32 x,I32 y)943 I32 Builder:: lt_16x2(I32 x, I32 y) { return {this, push(Op:: gt_i16x2, y.id, x.id)}; } lte_16x2(I32 x,I32 y)944 I32 Builder::lte_16x2(I32 x, I32 y) { return {this, push(Op::gte_i16x2, y.id, x.id)}; } gt_16x2(I32 x,I32 y)945 I32 Builder:: gt_16x2(I32 x, I32 y) { return {this, push(Op:: gt_i16x2, x.id, y.id)}; } gte_16x2(I32 x,I32 y)946 I32 Builder::gte_16x2(I32 x, I32 y) { return {this, push(Op::gte_i16x2, x.id, y.id)}; } 947 bit_and(I32 x,I32 y)948 I32 Builder::bit_and(I32 x, I32 y) { 949 int X,Y; 950 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&Y); } 951 if (this->isImm(y.id, 0)) { return this->splat(0); } // (x & false) == false 952 if (this->isImm(x.id, 0)) { return this->splat(0); } // (false & y) == false 953 if (this->isImm(y.id,~0)) { return x; } // (x & true) == x 954 if (this->isImm(x.id,~0)) { return y; } // (true & y) == y 955 return {this, push(Op::bit_and, x.id, y.id)}; 956 } bit_or(I32 x,I32 y)957 I32 Builder::bit_or(I32 x, I32 y) { 958 int X,Y; 959 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X|Y); } 960 if (this->isImm(y.id, 0)) { return x; } // (x | false) == x 961 if (this->isImm(x.id, 0)) { return y; } // (false | y) == y 962 if (this->isImm(y.id,~0)) { return this->splat(~0); } // (x | true) == true 963 if (this->isImm(x.id,~0)) { return this->splat(~0); } // (true | y) == true 964 return {this, push(Op::bit_or, x.id, y.id)}; 965 } bit_xor(I32 x,I32 y)966 I32 Builder::bit_xor(I32 x, I32 y) { 967 int X,Y; 968 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X^Y); } 969 if (this->isImm(y.id, 0)) { return x; } // (x ^ false) == x 970 if (this->isImm(x.id, 0)) { return y; } // (false ^ y) == y 971 return {this, push(Op::bit_xor, x.id, y.id)}; 972 } bit_clear(I32 x,I32 y)973 I32 Builder::bit_clear(I32 x, I32 y) { 974 int X,Y; 975 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&~Y); } 976 if (this->isImm(y.id, 0)) { return x; } // (x & ~false) == x 977 if (this->isImm(y.id,~0)) { return this->splat(0); } // (x & ~true) == false 978 if (this->isImm(x.id, 0)) { return this->splat(0); } // (false & ~y) == false 979 return {this, push(Op::bit_clear, x.id, y.id)}; 980 } 981 select(I32 x,I32 y,I32 z)982 I32 Builder::select(I32 x, I32 y, I32 z) { 983 int X,Y,Z; 984 if (this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return this->splat(X?Y:Z); } 985 // TODO: some cases to reduce to bit_and when y == 0 or z == 0? 986 return {this, push(Op::select, x.id, y.id, z.id)}; 987 } 988 extract(I32 x,int bits,I32 z)989 I32 Builder::extract(I32 x, int bits, I32 z) { 990 int Z; 991 if (this->allImm(z.id,&Z) && (~0u>>bits) == (unsigned)Z) { return this->shr(x, bits); } 992 return this->bit_and(z, this->shr(x, bits)); 993 } 994 pack(I32 x,I32 y,int bits)995 I32 Builder::pack(I32 x, I32 y, int bits) { 996 int X,Y; 997 if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X|(Y<<bits)); } 998 return {this, push(Op::pack, x.id,y.id,NA, 0,bits)}; 999 } 1000 bytes(I32 x,int control)1001 I32 Builder::bytes(I32 x, int control) { 1002 return {this, push(Op::bytes, x.id,NA,NA, control)}; 1003 } 1004 floor(F32 x)1005 F32 Builder::floor(F32 x) { 1006 float X; 1007 if (this->allImm(x.id,&X)) { return this->splat(floorf(X)); } 1008 return {this, push(Op::floor, x.id)}; 1009 } to_f32(I32 x)1010 F32 Builder::to_f32(I32 x) { 1011 int X; 1012 if (this->allImm(x.id,&X)) { return this->splat((float)X); } 1013 return {this, push(Op::to_f32, x.id)}; 1014 } trunc(F32 x)1015 I32 Builder::trunc(F32 x) { 1016 float X; 1017 if (this->allImm(x.id,&X)) { return this->splat((int)X); } 1018 return {this, push(Op::trunc, x.id)}; 1019 } round(F32 x)1020 I32 Builder::round(F32 x) { 1021 float X; 1022 if (this->allImm(x.id,&X)) { return this->splat((int)lrintf(X)); } 1023 return {this, push(Op::round, x.id)}; 1024 } 1025 from_unorm(int bits,I32 x)1026 F32 Builder::from_unorm(int bits, I32 x) { 1027 F32 limit = splat(1 / ((1<<bits)-1.0f)); 1028 return mul(to_f32(x), limit); 1029 } to_unorm(int bits,F32 x)1030 I32 Builder::to_unorm(int bits, F32 x) { 1031 F32 limit = splat((1<<bits)-1.0f); 1032 return round(mul(x, limit)); 1033 } 1034 unpack_1010102(I32 rgba)1035 Color Builder::unpack_1010102(I32 rgba) { 1036 return { 1037 from_unorm(10, extract(rgba, 0, 0x3ff)), 1038 from_unorm(10, extract(rgba, 10, 0x3ff)), 1039 from_unorm(10, extract(rgba, 20, 0x3ff)), 1040 from_unorm( 2, extract(rgba, 30, 0x3 )), 1041 }; 1042 } unpack_8888(I32 rgba)1043 Color Builder::unpack_8888(I32 rgba) { 1044 return { 1045 from_unorm(8, extract(rgba, 0, 0xff)), 1046 from_unorm(8, extract(rgba, 8, 0xff)), 1047 from_unorm(8, extract(rgba, 16, 0xff)), 1048 from_unorm(8, extract(rgba, 24, 0xff)), 1049 }; 1050 } unpack_565(I32 bgr)1051 Color Builder::unpack_565(I32 bgr) { 1052 return { 1053 from_unorm(5, extract(bgr, 11, 0b011'111)), 1054 from_unorm(6, extract(bgr, 5, 0b111'111)), 1055 from_unorm(5, extract(bgr, 0, 0b011'111)), 1056 splat(1.0f), 1057 }; 1058 } 1059 unpremul(F32 * r,F32 * g,F32 * b,F32 a)1060 void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) { 1061 skvm::F32 invA = div(1.0f, a), 1062 inf = bit_cast(splat(0x7f800000)); 1063 // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0). 1064 invA = bit_cast(bit_and(lt(invA, inf), 1065 bit_cast(invA))); 1066 *r = mul(*r, invA); 1067 *g = mul(*g, invA); 1068 *b = mul(*b, invA); 1069 } 1070 premul(F32 * r,F32 * g,F32 * b,F32 a)1071 void Builder::premul(F32* r, F32* g, F32* b, F32 a) { 1072 *r = mul(*r, a); 1073 *g = mul(*g, a); 1074 *b = mul(*b, a); 1075 } 1076 uniformPremul(SkColor4f color,SkColorSpace * src,Uniforms * uniforms,SkColorSpace * dst)1077 Color Builder::uniformPremul(SkColor4f color, SkColorSpace* src, 1078 Uniforms* uniforms, SkColorSpace* dst) { 1079 SkColorSpaceXformSteps(src, kUnpremul_SkAlphaType, 1080 dst, kPremul_SkAlphaType).apply(color.vec()); 1081 return { 1082 uniformF(uniforms->pushF(color.fR)), 1083 uniformF(uniforms->pushF(color.fG)), 1084 uniformF(uniforms->pushF(color.fB)), 1085 uniformF(uniforms->pushF(color.fA)), 1086 }; 1087 } 1088 lerp(Color lo,Color hi,F32 t)1089 Color Builder::lerp(Color lo, Color hi, F32 t) { 1090 return { 1091 lerp(lo.r, hi.r, t), 1092 lerp(lo.g, hi.g, t), 1093 lerp(lo.b, hi.b, t), 1094 lerp(lo.a, hi.a, t), 1095 }; 1096 } 1097 to_hsla(Color c)1098 HSLA Builder::to_hsla(Color c) { 1099 F32 mx = max(max(c.r,c.g),c.b), 1100 mn = min(min(c.r,c.g),c.b), 1101 d = mx - mn, 1102 g_lt_b = select(c.g < c.b, splat(6.0f) 1103 , splat(0.0f)); 1104 1105 auto diffm = [&](auto a, auto b) { 1106 return (a - b) * (1 / d); 1107 }; 1108 1109 F32 h = mul(1/6.0f, 1110 select(eq(mx, mn), 0.0f, 1111 select(eq(mx, c.r), add(diffm(c.g,c.b), g_lt_b), 1112 select(eq(mx, c.g), add(diffm(c.b,c.r), 2.0f) 1113 , add(diffm(c.r,c.g), 4.0f))))); 1114 1115 F32 sum = add(mx,mn); 1116 F32 l = mul(sum, 0.5f); 1117 F32 s = select(eq(mx,mn), 0.0f 1118 , div(d, select(gt(l,0.5f), sub(2.0f,sum) 1119 , sum))); 1120 return {h, s, l, c.a}; 1121 } 1122 to_rgba(HSLA c)1123 Color Builder::to_rgba(HSLA c) { 1124 // See GrRGBToHSLFilterEffect.fp 1125 1126 auto h = c.h; 1127 auto s = c.s; 1128 auto l = c.l; 1129 F32 x = mul(sub(1.0f, abs(sub(add(l,l), 1.0f))), s); 1130 1131 auto hue_to_rgb = [&](auto hue) { 1132 auto q = sub(abs(mad(fract(hue), splat(6.0f), splat(-3.0f))), splat(1.0f)); 1133 return mad(sub(clamp01(q), splat(0.5f)), x, l); 1134 }; 1135 1136 return { 1137 hue_to_rgb(add(h, 0/3.0f)), 1138 hue_to_rgb(add(h, 2/3.0f)), 1139 hue_to_rgb(add(h, 1/3.0f)), 1140 c.a, 1141 }; 1142 } 1143 1144 // We're basing our implementation of non-separable blend modes on 1145 // https://www.w3.org/TR/compositing-1/#blendingnonseparable. 1146 // and 1147 // https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf 1148 // They're equivalent, but ES' math has been better simplified. 1149 // 1150 // Anything extra we add beyond that is to make the math work with premul inputs. 1151 saturation(skvm::Builder * p,skvm::F32 r,skvm::F32 g,skvm::F32 b)1152 static skvm::F32 saturation(skvm::Builder* p, skvm::F32 r, skvm::F32 g, skvm::F32 b) { 1153 return max(r, max(g, b)) 1154 - min(r, min(g, b)); 1155 } 1156 luminance(skvm::Builder * p,skvm::F32 r,skvm::F32 g,skvm::F32 b)1157 static skvm::F32 luminance(skvm::Builder* p, skvm::F32 r, skvm::F32 g, skvm::F32 b) { 1158 return r*0.30f + (g*0.59f + b*0.11f); 1159 } 1160 set_sat(skvm::Builder * p,skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 s)1161 static void set_sat(skvm::Builder* p, skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) { 1162 F32 mn = min(*r, min(*g, *b)), 1163 mx = max(*r, max(*g, *b)), 1164 sat = mx - mn; 1165 1166 // Map min channel to 0, max channel to s, and scale the middle proportionally. 1167 auto scale = [&](auto c) { 1168 // TODO: better to divide and check for non-finite result? 1169 return select(sat == 0.0f, 0.0f 1170 , ((c - mn) * s) / sat); 1171 }; 1172 *r = scale(*r); 1173 *g = scale(*g); 1174 *b = scale(*b); 1175 } 1176 set_lum(skvm::Builder * p,skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1177 static void set_lum(skvm::Builder* p, skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) { 1178 auto diff = lu - luminance(p, *r, *g, *b); 1179 *r += diff; 1180 *g += diff; 1181 *b += diff; 1182 } 1183 clip_color(skvm::Builder * p,skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 a)1184 static void clip_color(skvm::Builder* p, 1185 skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) { 1186 F32 mn = min(*r, min(*g, *b)), 1187 mx = max(*r, max(*g, *b)), 1188 lu = luminance(p, *r, *g, *b); 1189 1190 auto clip = [&](auto c) { 1191 c = select(mn >= 0, c 1192 , lu + ((c-lu)*( lu)) / (lu-mn)); 1193 c = select(mx > a, lu + ((c-lu)*(a-lu)) / (mx-lu) 1194 , c); 1195 // Sometimes without this we may dip just a little negative. 1196 return max(c, 0.0f); 1197 }; 1198 *r = clip(*r); 1199 *g = clip(*g); 1200 *b = clip(*b); 1201 } 1202 blend(SkBlendMode mode,Color src,Color dst)1203 Color Builder::blend(SkBlendMode mode, Color src, Color dst) { 1204 auto mma = [this](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) { 1205 return mad(x,y, mul(z,w)); 1206 }; 1207 1208 auto two = [this](skvm::F32 x) { return add(x, x); }; 1209 1210 auto apply_rgba = [&](auto&& fn) { 1211 return Color { 1212 fn(src.r, dst.r), 1213 fn(src.g, dst.g), 1214 fn(src.b, dst.b), 1215 fn(src.a, dst.a), 1216 }; 1217 }; 1218 1219 auto apply_rgb_srcover_a = [&](auto&& fn) { 1220 return Color { 1221 fn(src.r, dst.r), 1222 fn(src.g, dst.g), 1223 fn(src.b, dst.b), 1224 mad(dst.a, 1-src.a, src.a), // srcover for alpha 1225 }; 1226 }; 1227 1228 auto non_sep = [&](auto R, auto G, auto B) { 1229 return Color{ 1230 add(mma(src.r, 1-dst.a, dst.r, 1-src.a), R), 1231 add(mma(src.g, 1-dst.a, dst.g, 1-src.a), G), 1232 add(mma(src.b, 1-dst.a, dst.b, 1-src.a), B), 1233 mad(dst.a,1-src.a, src.a), // srcover 1234 }; 1235 }; 1236 1237 switch (mode) { 1238 default: SkASSERT(false); /*but also, for safety, fallthrough*/ 1239 1240 case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) }; 1241 1242 case SkBlendMode::kSrc: return src; 1243 case SkBlendMode::kDst: return dst; 1244 1245 case SkBlendMode::kDstOver: std::swap(src, dst); // fall-through 1246 case SkBlendMode::kSrcOver: 1247 return apply_rgba([&](auto s, auto d) { 1248 return this->mad(d,1-src.a, s); 1249 }); 1250 1251 case SkBlendMode::kDstIn: std::swap(src, dst); // fall-through 1252 case SkBlendMode::kSrcIn: 1253 return apply_rgba([&](auto s, auto d) { 1254 return this->mul(s, dst.a); 1255 }); 1256 1257 case SkBlendMode::kDstOut: std::swap(src, dst); // fall-through 1258 case SkBlendMode::kSrcOut: 1259 return apply_rgba([&](auto s, auto d) { 1260 return this->mul(s, 1-dst.a); 1261 }); 1262 1263 case SkBlendMode::kDstATop: std::swap(src, dst); // fall-through 1264 case SkBlendMode::kSrcATop: 1265 return apply_rgba([&](auto s, auto d) { 1266 return mma(s, dst.a, d, 1-src.a); 1267 }); 1268 1269 case SkBlendMode::kXor: 1270 return apply_rgba([&](auto s, auto d) { 1271 return mma(s, 1-dst.a, d, 1-src.a); 1272 }); 1273 1274 case SkBlendMode::kPlus: 1275 return apply_rgba([&](auto s, auto d) { 1276 return this->min(add(s, d), 1.0f); 1277 }); 1278 1279 case SkBlendMode::kModulate: 1280 return apply_rgba([&](auto s, auto d) { 1281 return this->mul(s, d); 1282 }); 1283 1284 case SkBlendMode::kScreen: 1285 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts. 1286 // It's kind of plausible that s + (d - sd) keeps more precision? 1287 return apply_rgba([&](auto s, auto d) { 1288 return this->add(s, this->sub(d, this->mul(s, d))); 1289 }); 1290 1291 case SkBlendMode::kDarken: 1292 return apply_rgb_srcover_a([&](auto s, auto d) { 1293 return this->add(s, this->sub(d, this->max(this->mul(s, dst.a), 1294 this->mul(d, src.a)))); 1295 }); 1296 1297 case SkBlendMode::kLighten: 1298 return apply_rgb_srcover_a([&](auto s, auto d) { 1299 return this->add(s, this->sub(d, this->min(this->mul(s, dst.a), 1300 this->mul(d, src.a)))); 1301 }); 1302 1303 case SkBlendMode::kDifference: 1304 return apply_rgb_srcover_a([&](auto s, auto d) { 1305 return this->add(s, this->sub(d, two(this->min(this->mul(s, dst.a), 1306 this->mul(d, src.a))))); 1307 }); 1308 1309 case SkBlendMode::kExclusion: 1310 return apply_rgb_srcover_a([&](auto s, auto d) { 1311 return this->add(s, this->sub(d, two(this->mul(s, d)))); 1312 }); 1313 1314 case SkBlendMode::kColorBurn: 1315 return apply_rgb_srcover_a([&](auto s, auto d) { 1316 // TODO: divide and check for non-finite result instead of checking for s == 0. 1317 auto mn = this->min(dst.a, 1318 this->div(this->mul(this->sub(dst.a, d), src.a), s)), 1319 burn = this->mad(src.a, this->sub(dst.a, mn), mma(s, 1-dst.a, d, 1-src.a)); 1320 return select(eq(d, dst.a), this->mad(s, 1-dst.a, d), 1321 select(eq(s, 0.0f), this->mul(d, 1-src.a) 1322 , burn)); 1323 }); 1324 1325 case SkBlendMode::kColorDodge: 1326 return apply_rgb_srcover_a([&](auto s, auto d) { 1327 // TODO: divide and check for non-finite result instead of checking for s == sa. 1328 auto dodge = this->mad(src.a, this->min(dst.a, 1329 this->div(this->mul(d, src.a), this->sub(src.a, s))), 1330 mma(s, 1-dst.a, d, 1-src.a)); 1331 return select(eq(d, 0.0f), mul(s, 1-dst.a), 1332 select(eq(s, src.a), mad(d, 1-src.a, s) 1333 , dodge)); 1334 }); 1335 1336 case SkBlendMode::kHardLight: 1337 return apply_rgb_srcover_a([&](auto s, auto d) { 1338 return add(mma(s, 1-dst.a, d, 1-src.a), 1339 select(lte(two(s), src.a), 1340 two(mul(s, d)), 1341 sub(mul(src.a, dst.a), two(mul(sub(dst.a, d), sub(src.a, s)))))); 1342 }); 1343 1344 case SkBlendMode::kOverlay: 1345 return apply_rgb_srcover_a([&](auto s, auto d) { 1346 return add(mma(s, 1-dst.a, d, 1-src.a), 1347 select(lte(two(d), dst.a), 1348 two(mul(s, d)), 1349 sub(mul(src.a, dst.a), two(mul(sub(dst.a, d), sub(src.a, s)))))); 1350 }); 1351 1352 case SkBlendMode::kMultiply: 1353 return apply_rgba([&](auto s, auto d) { 1354 return this->add(mma(s, 1-dst.a, d, 1-src.a), this->mul(s, d)); 1355 }); 1356 1357 case SkBlendMode::kSoftLight: 1358 return apply_rgb_srcover_a([&](auto s, auto d) { 1359 auto m = select(gt(dst.a, 0.0f), div(d, dst.a), 0.0f), 1360 s2 = two(s), 1361 m4 = two(two(m)); 1362 1363 // The logic forks three ways: 1364 // 1. dark src? 1365 // 2. light src, dark dst? 1366 // 3. light src, light dst? 1367 1368 // Used in case 1 1369 auto darkSrc = mul(d, mad(sub(s2, src.a), 1-m, src.a)), 1370 // Used in case 2 1371 darkDst = mad(mad(m4, m4, m4), sub(m, 1.0f), mul(7.0f, m)), 1372 // Used in case 3. 1373 liteDst = sub(sqrt(m), m), 1374 // Used in 2 or 3? 1375 liteSrc = mad(mul(dst.a, sub(s2, src.a)), 1376 select(lte(two(two(d)), dst.a), darkDst, liteDst), 1377 mul(d, src.a)); 1378 return mad(s, 1-dst.a, mad(d, 1379 1-src.a, 1380 select(lte(s2, src.a), darkSrc, liteSrc))); 1381 1382 1383 }); 1384 1385 case SkBlendMode::kHue: { 1386 skvm::F32 R = mul(src.r, src.a), 1387 G = mul(src.g, src.a), 1388 B = mul(src.b, src.a); 1389 1390 set_sat(this, &R, &G, &B, mul(saturation(this, dst.r, dst.g, dst.b), src.a)); 1391 set_lum(this, &R, &G, &B, mul( luminance(this, dst.r, dst.g, dst.b), src.a)); 1392 clip_color(this, &R, &G, &B, mul(src.a, dst.a)); 1393 1394 return non_sep(R, G, B); 1395 } 1396 1397 case SkBlendMode::kSaturation: { 1398 skvm::F32 R = mul(dst.r, src.a), 1399 G = mul(dst.g, src.a), 1400 B = mul(dst.b, src.a); 1401 1402 set_sat(this, &R, &G, &B, mul(saturation(this, src.r, src.g, src.b), dst.a)); 1403 set_lum(this, &R, &G, &B, mul( luminance(this, dst.r, dst.g, dst.b), src.a)); 1404 clip_color(this, &R, &G, &B, mul(src.a, dst.a)); 1405 1406 return non_sep(R, G, B); 1407 } 1408 1409 case SkBlendMode::kColor: { 1410 skvm::F32 R = mul(src.r, dst.a), 1411 G = mul(src.g, dst.a), 1412 B = mul(src.b, dst.a); 1413 1414 set_lum(this, &R, &G, &B, mul(luminance(this, dst.r, dst.g, dst.b), src.a)); 1415 clip_color(this, &R, &G, &B, mul(src.a, dst.a)); 1416 1417 return non_sep(R, G, B); 1418 } 1419 1420 case SkBlendMode::kLuminosity: { 1421 skvm::F32 R = mul(dst.r, src.a), 1422 G = mul(dst.g, src.a), 1423 B = mul(dst.b, src.a); 1424 1425 set_lum(this, &R, &G, &B, mul(luminance(this, src.r, src.g, src.b), dst.a)); 1426 clip_color(this, &R, &G, &B, mul(src.a, dst.a)); 1427 1428 return non_sep(R, G, B); 1429 } 1430 } 1431 } 1432 1433 // Fill live and sinks each if non-null: 1434 // - (*live)[id]: notes whether each input instruction is live 1435 // - *sinks: an unsorted set of live instructions with side effects (stores, assert_true) 1436 // Returns the number of live instructions. liveness_analysis(const std::vector<Instruction> & instructions,std::vector<bool> * live,std::vector<Val> * sinks)1437 int liveness_analysis(const std::vector<Instruction>& instructions, 1438 std::vector<bool>* live, 1439 std::vector<Val>* sinks) { 1440 int instruction_count = instructions.size(); 1441 live->resize(instruction_count, false); 1442 int liveInstructionCount = 0; 1443 auto trace = [&](Val id, auto& recurse) -> void { 1444 if (!(*live)[id]) { 1445 (*live)[id] = true; 1446 liveInstructionCount++; 1447 Instruction inst = instructions[id]; 1448 if (inst.x != NA) { recurse(inst.x, recurse); } 1449 if (inst.y != NA) { recurse(inst.y, recurse); } 1450 if (inst.z != NA) { recurse(inst.z, recurse); } 1451 } 1452 }; 1453 1454 // For all the sink instructions. 1455 for (Val id = 0; id < instruction_count; id++) { 1456 if (instructions[id].op <= skvm::Op::store32) { 1457 sinks->push_back(id); 1458 trace(id, trace); 1459 } 1460 } 1461 return liveInstructionCount; 1462 } 1463 1464 // For a given program we'll store each Instruction's users contiguously in a table, 1465 // and track where each Instruction's span of users starts and ends in another index. 1466 // Here's a simple program that loads x and stores kx+k: 1467 // 1468 // v0 = splat(k) 1469 // v1 = load(...) 1470 // v2 = mul(v1, v0) 1471 // v3 = add(v2, v0) 1472 // v4 = store(..., v3) 1473 // 1474 // This program has 5 instructions v0-v4. 1475 // - v0 is used by v2 and v3 1476 // - v1 is used by v2 1477 // - v2 is used by v3 1478 // - v3 is used by v4 1479 // - v4 has a side-effect 1480 // 1481 // For this program we fill out these two arrays: 1482 // table: [v2,v3, v2, v3, v4] 1483 // index: [0, 2, 3, 4, 5] 1484 // 1485 // The table is just those "is used by ..." I wrote out above in order, 1486 // and the index tracks where an Instruction's span of users starts, table[index[id]]. 1487 // The span continues up until the start of the next Instruction, table[index[id+1]]. users(Val id) const1488 SkSpan<const Val> Usage::users(Val id) const { 1489 int begin = fIndex[id]; 1490 int end = fIndex[id + 1]; 1491 return SkMakeSpan(fTable.data() + begin, end - begin); 1492 } 1493 Usage(const std::vector<Instruction> & program,const std::vector<bool> & live)1494 Usage::Usage(const std::vector<Instruction>& program, const std::vector<bool>& live) { 1495 // uses[id] counts the number of times each Instruction is used. 1496 std::vector<int> uses(program.size(), 0); 1497 for (Val id = 0; id < (Val)program.size(); id++) { 1498 if (live[id]) { 1499 Instruction inst = program[id]; 1500 if (inst.x != NA) { ++uses[inst.x]; } 1501 if (inst.y != NA) { ++uses[inst.y]; } 1502 if (inst.z != NA) { ++uses[inst.z]; } 1503 } 1504 } 1505 1506 // Build our index into fTable, with an extra entry marking the final Instruction's end. 1507 fIndex.reserve(program.size() + 1); 1508 int total_uses = 0; 1509 for (int n : uses) { 1510 fIndex.push_back(total_uses); 1511 total_uses += n; 1512 } 1513 fIndex.push_back(total_uses); 1514 1515 // Tick down each Instruction's uses to fill in fTable. 1516 fTable.resize(total_uses, NA); 1517 for (Val id = (Val)program.size(); id --> 0; ) { 1518 if (live[id]) { 1519 Instruction inst = program[id]; 1520 if (inst.x != NA) { fTable[fIndex[inst.x] + --uses[inst.x]] = id; } 1521 if (inst.y != NA) { fTable[fIndex[inst.y] + --uses[inst.y]] = id; } 1522 if (inst.z != NA) { fTable[fIndex[inst.z] + --uses[inst.z]] = id; } 1523 } 1524 } 1525 for (int n : uses ) { (void)n; SkASSERT(n == 0 ); } 1526 for (Val id : fTable) { (void)id; SkASSERT(id != NA); } 1527 } 1528 1529 // ~~~~ Program::eval() and co. ~~~~ // 1530 1531 // Handy references for x86-64 instruction encoding: 1532 // https://wiki.osdev.org/X86-64_Instruction_Encoding 1533 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm 1534 // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm 1535 // http://ref.x86asm.net/coder64.html 1536 1537 // Used for ModRM / immediate instruction encoding. _233(int a,int b,int c)1538 static uint8_t _233(int a, int b, int c) { 1539 return (a & 3) << 6 1540 | (b & 7) << 3 1541 | (c & 7) << 0; 1542 } 1543 1544 // ModRM byte encodes the arguments of an opcode. 1545 enum class Mod { Indirect, OneByteImm, FourByteImm, Direct }; mod_rm(Mod mod,int reg,int rm)1546 static uint8_t mod_rm(Mod mod, int reg, int rm) { 1547 return _233((int)mod, reg, rm); 1548 } 1549 mod(int imm)1550 static Mod mod(int imm) { 1551 if (imm == 0) { return Mod::Indirect; } 1552 if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; } 1553 return Mod::FourByteImm; 1554 } 1555 imm_bytes(Mod mod)1556 static int imm_bytes(Mod mod) { 1557 switch (mod) { 1558 case Mod::Indirect: return 0; 1559 case Mod::OneByteImm: return 1; 1560 case Mod::FourByteImm: return 4; 1561 case Mod::Direct: SkUNREACHABLE; 1562 } 1563 SkUNREACHABLE; 1564 } 1565 1566 // SIB byte encodes a memory address, base + (index * scale). sib(Assembler::Scale scale,int index,int base)1567 static uint8_t sib(Assembler::Scale scale, int index, int base) { 1568 return _233((int)scale, index, base); 1569 } 1570 1571 // The REX prefix is used to extend most old 32-bit instructions to 64-bit. rex(bool W,bool R,bool X,bool B)1572 static uint8_t rex(bool W, // If set, operation is 64-bit, otherwise default, usually 32-bit. 1573 bool R, // Extra top bit to select ModRM reg, registers 8-15. 1574 bool X, // Extra top bit for SIB index register. 1575 bool B) { // Extra top bit for SIB base or ModRM rm register. 1576 return 0b01000000 // Fixed 0100 for top four bits. 1577 | (W << 3) 1578 | (R << 2) 1579 | (X << 1) 1580 | (B << 0); 1581 } 1582 1583 1584 // The VEX prefix extends SSE operations to AVX. Used generally, even with XMM. 1585 struct VEX { 1586 int len; 1587 uint8_t bytes[3]; 1588 }; 1589 vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1590 static VEX vex(bool WE, // Like REX W for int operations, or opcode extension for float? 1591 bool R, // Same as REX R. Pass high bit of dst register, dst>>3. 1592 bool X, // Same as REX X. 1593 bool B, // Same as REX B. Pass y>>3 for 3-arg ops, x>>3 for 2-arg. 1594 int map, // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f. 1595 int vvvv, // 4-bit second operand register. Pass our x for 3-arg ops. 1596 bool L, // Set for 256-bit ymm operations, off for 128-bit xmm. 1597 int pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none. 1598 1599 // Pack x86 opcode map selector to 5-bit VEX encoding. 1600 map = [map]{ 1601 switch (map) { 1602 case 0x0f: return 0b00001; 1603 case 0x380f: return 0b00010; 1604 case 0x3a0f: return 0b00011; 1605 // Several more cases only used by XOP / TBM. 1606 } 1607 SkUNREACHABLE; 1608 }(); 1609 1610 // Pack mandatory SSE opcode prefix byte to 2-bit VEX encoding. 1611 pp = [pp]{ 1612 switch (pp) { 1613 case 0x66: return 0b01; 1614 case 0xf3: return 0b10; 1615 case 0xf2: return 0b11; 1616 } 1617 return 0b00; 1618 }(); 1619 1620 VEX vex = {0, {0,0,0}}; 1621 if (X == 0 && B == 0 && WE == 0 && map == 0b00001) { 1622 // With these conditions met, we can optionally compress VEX to 2-byte. 1623 vex.len = 2; 1624 vex.bytes[0] = 0xc5; 1625 vex.bytes[1] = (pp & 3) << 0 1626 | (L & 1) << 2 1627 | (~vvvv & 15) << 3 1628 | (~(int)R & 1) << 7; 1629 } else { 1630 // We could use this 3-byte VEX prefix all the time if we like. 1631 vex.len = 3; 1632 vex.bytes[0] = 0xc4; 1633 vex.bytes[1] = (map & 31) << 0 1634 | (~(int)B & 1) << 5 1635 | (~(int)X & 1) << 6 1636 | (~(int)R & 1) << 7; 1637 vex.bytes[2] = (pp & 3) << 0 1638 | (L & 1) << 2 1639 | (~vvvv & 15) << 3 1640 | (WE & 1) << 7; 1641 } 1642 return vex; 1643 } 1644 Assembler(void * buf)1645 Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {} 1646 size() const1647 size_t Assembler::size() const { return fSize; } 1648 bytes(const void * p,int n)1649 void Assembler::bytes(const void* p, int n) { 1650 if (fCurr) { 1651 memcpy(fCurr, p, n); 1652 fCurr += n; 1653 } 1654 fSize += n; 1655 } 1656 byte(uint8_t b)1657 void Assembler::byte(uint8_t b) { this->bytes(&b, 1); } word(uint32_t w)1658 void Assembler::word(uint32_t w) { this->bytes(&w, 4); } 1659 align(int mod)1660 void Assembler::align(int mod) { 1661 while (this->size() % mod) { 1662 this->byte(0x00); 1663 } 1664 } 1665 int3()1666 void Assembler::int3() { 1667 this->byte(0xcc); 1668 } 1669 vzeroupper()1670 void Assembler::vzeroupper() { 1671 this->byte(0xc5); 1672 this->byte(0xf8); 1673 this->byte(0x77); 1674 } ret()1675 void Assembler::ret() { this->byte(0xc3); } 1676 1677 // Common instruction building for 64-bit opcodes with an immediate argument. op(int opcode,int opcode_ext,GP64 dst,int imm)1678 void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) { 1679 opcode |= 0b0000'0001; // low bit set for 64-bit operands 1680 opcode |= 0b1000'0000; // top bit set for instructions with any immediate 1681 1682 int imm_bytes = 4; 1683 if (SkTFitsIn<int8_t>(imm)) { 1684 imm_bytes = 1; 1685 opcode |= 0b0000'0010; // second bit set for 8-bit immediate, else 32-bit. 1686 } 1687 1688 this->byte(rex(1,0,0,dst>>3)); 1689 this->byte(opcode); 1690 this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7)); 1691 this->bytes(&imm, imm_bytes); 1692 } 1693 add(GP64 dst,int imm)1694 void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); } sub(GP64 dst,int imm)1695 void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); } cmp(GP64 reg,int imm)1696 void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); } 1697 movq(GP64 dst,GP64 src,int off)1698 void Assembler::movq(GP64 dst, GP64 src, int off) { 1699 this->byte(rex(1,dst>>3,0,src>>3)); 1700 this->byte(0x8b); 1701 this->byte(mod_rm(mod(off), dst&7, src&7)); 1702 this->bytes(&off, imm_bytes(mod(off))); 1703 } 1704 op(int prefix,int map,int opcode,Ymm dst,Ymm x,Ymm y,bool W)1705 void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) { 1706 VEX v = vex(W, dst>>3, 0, y>>3, 1707 map, x, 1/*ymm, not xmm*/, prefix); 1708 this->bytes(v.bytes, v.len); 1709 this->byte(opcode); 1710 this->byte(mod_rm(Mod::Direct, dst&7, y&7)); 1711 } 1712 vpaddd(Ymm dst,Ymm x,YmmOrLabel y)1713 void Assembler::vpaddd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66, 0x0f,0xfe, dst,x,y); } vpsubd(Ymm dst,Ymm x,YmmOrLabel y)1714 void Assembler::vpsubd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66, 0x0f,0xfa, dst,x,y); } vpmulld(Ymm dst,Ymm x,Ymm y)1715 void Assembler::vpmulld(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x40, dst,x,y); } 1716 vpsubw(Ymm dst,Ymm x,Ymm y)1717 void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); } vpmullw(Ymm dst,Ymm x,Ymm y)1718 void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); } 1719 vpand(Ymm dst,Ymm x,YmmOrLabel y)1720 void Assembler::vpand (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xdb, dst,x,y); } vpor(Ymm dst,Ymm x,YmmOrLabel y)1721 void Assembler::vpor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xeb, dst,x,y); } vpxor(Ymm dst,Ymm x,YmmOrLabel y)1722 void Assembler::vpxor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xef, dst,x,y); } vpandn(Ymm dst,Ymm x,Ymm y)1723 void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdf, dst,x,y); } 1724 vaddps(Ymm dst,Ymm x,YmmOrLabel y)1725 void Assembler::vaddps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x58, dst,x,y); } vsubps(Ymm dst,Ymm x,YmmOrLabel y)1726 void Assembler::vsubps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5c, dst,x,y); } vmulps(Ymm dst,Ymm x,YmmOrLabel y)1727 void Assembler::vmulps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x59, dst,x,y); } vdivps(Ymm dst,Ymm x,Ymm y)1728 void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5e, dst,x,y); } vminps(Ymm dst,Ymm x,YmmOrLabel y)1729 void Assembler::vminps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5d, dst,x,y); } vmaxps(Ymm dst,Ymm x,YmmOrLabel y)1730 void Assembler::vmaxps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5f, dst,x,y); } 1731 vfmadd132ps(Ymm dst,Ymm x,Ymm y)1732 void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); } vfmadd213ps(Ymm dst,Ymm x,Ymm y)1733 void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); } vfmadd231ps(Ymm dst,Ymm x,Ymm y)1734 void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); } 1735 vfmsub132ps(Ymm dst,Ymm x,Ymm y)1736 void Assembler::vfmsub132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x9a, dst,x,y); } vfmsub213ps(Ymm dst,Ymm x,Ymm y)1737 void Assembler::vfmsub213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xaa, dst,x,y); } vfmsub231ps(Ymm dst,Ymm x,Ymm y)1738 void Assembler::vfmsub231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xba, dst,x,y); } 1739 vfnmadd132ps(Ymm dst,Ymm x,Ymm y)1740 void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x9c, dst,x,y); } vfnmadd213ps(Ymm dst,Ymm x,Ymm y)1741 void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xac, dst,x,y); } vfnmadd231ps(Ymm dst,Ymm x,Ymm y)1742 void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xbc, dst,x,y); } 1743 vpackusdw(Ymm dst,Ymm x,Ymm y)1744 void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); } vpackuswb(Ymm dst,Ymm x,Ymm y)1745 void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66, 0x0f,0x67, dst,x,y); } 1746 vpcmpeqd(Ymm dst,Ymm x,Ymm y)1747 void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); } vpcmpgtd(Ymm dst,Ymm x,Ymm y)1748 void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); } 1749 vcmpps(Ymm dst,Ymm x,Ymm y,int imm)1750 void Assembler::vcmpps(Ymm dst, Ymm x, Ymm y, int imm) { 1751 this->op(0,0x0f,0xc2, dst,x,y); 1752 this->byte(imm); 1753 } 1754 vpblendvb(Ymm dst,Ymm x,Ymm y,Ymm z)1755 void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) { 1756 int prefix = 0x66, 1757 map = 0x3a0f, 1758 opcode = 0x4c; 1759 VEX v = vex(0, dst>>3, 0, y>>3, 1760 map, x, /*ymm?*/1, prefix); 1761 this->bytes(v.bytes, v.len); 1762 this->byte(opcode); 1763 this->byte(mod_rm(Mod::Direct, dst&7, y&7)); 1764 this->byte(z << 4); 1765 } 1766 1767 // dst = x op /opcode_ext imm op(int prefix,int map,int opcode,int opcode_ext,Ymm dst,Ymm x,int imm)1768 void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) { 1769 // This is a little weird, but if we pass the opcode_ext as if it were the dst register, 1770 // the dst register as if x, and the x register as if y, all the bits end up where we want. 1771 this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x); 1772 this->byte(imm); 1773 } 1774 vpslld(Ymm dst,Ymm x,int imm)1775 void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); } vpsrld(Ymm dst,Ymm x,int imm)1776 void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); } vpsrad(Ymm dst,Ymm x,int imm)1777 void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); } 1778 vpsrlw(Ymm dst,Ymm x,int imm)1779 void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); } 1780 1781 vpermq(Ymm dst,Ymm x,int imm)1782 void Assembler::vpermq(Ymm dst, Ymm x, int imm) { 1783 // A bit unusual among the instructions we use, this is 64-bit operation, so we set W. 1784 bool W = true; 1785 this->op(0x66,0x3a0f,0x00, dst,x,W); 1786 this->byte(imm); 1787 } 1788 vroundps(Ymm dst,Ymm x,int imm)1789 void Assembler::vroundps(Ymm dst, Ymm x, int imm) { 1790 this->op(0x66,0x3a0f,0x08, dst,x); 1791 this->byte(imm); 1792 } 1793 vmovdqa(Ymm dst,Ymm src)1794 void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); } 1795 vcvtdq2ps(Ymm dst,Ymm x)1796 void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op( 0,0x0f,0x5b, dst,x); } vcvttps2dq(Ymm dst,Ymm x)1797 void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); } vcvtps2dq(Ymm dst,Ymm x)1798 void Assembler::vcvtps2dq (Ymm dst, Ymm x) { this->op(0x66,0x0f,0x5b, dst,x); } vsqrtps(Ymm dst,Ymm x)1799 void Assembler::vsqrtps (Ymm dst, Ymm x) { this->op( 0,0x0f,0x51, dst,x); } 1800 here()1801 Assembler::Label Assembler::here() { 1802 return { (int)this->size(), Label::NotYetSet, {} }; 1803 } 1804 disp19(Label * l)1805 int Assembler::disp19(Label* l) { 1806 SkASSERT(l->kind == Label::NotYetSet || 1807 l->kind == Label::ARMDisp19); 1808 l->kind = Label::ARMDisp19; 1809 l->references.push_back(here().offset); 1810 // ARM 19-bit instruction count, from the beginning of this instruction. 1811 return (l->offset - here().offset) / 4; 1812 } 1813 disp32(Label * l)1814 int Assembler::disp32(Label* l) { 1815 SkASSERT(l->kind == Label::NotYetSet || 1816 l->kind == Label::X86Disp32); 1817 l->kind = Label::X86Disp32; 1818 l->references.push_back(here().offset); 1819 // x86 32-bit byte count, from the end of this instruction. 1820 return l->offset - (here().offset + 4); 1821 } 1822 op(int prefix,int map,int opcode,Ymm dst,Ymm x,Label * l)1823 void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) { 1824 // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13. 1825 const int rip = rbp; 1826 1827 VEX v = vex(0, dst>>3, 0, rip>>3, 1828 map, x, /*ymm?*/1, prefix); 1829 this->bytes(v.bytes, v.len); 1830 this->byte(opcode); 1831 this->byte(mod_rm(Mod::Indirect, dst&7, rip&7)); 1832 this->word(this->disp32(l)); 1833 } 1834 op(int prefix,int map,int opcode,Ymm dst,Ymm x,YmmOrLabel y)1835 void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, YmmOrLabel y) { 1836 y.label ? this->op(prefix,map,opcode,dst,x, y.label) 1837 : this->op(prefix,map,opcode,dst,x, y.ymm ); 1838 } 1839 vpshufb(Ymm dst,Ymm x,Label * l)1840 void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); } vptest(Ymm dst,Label * l)1841 void Assembler::vptest(Ymm dst, Label* l) { this->op(0x66, 0x380f, 0x17, dst, (Ymm)0, l); } 1842 vbroadcastss(Ymm dst,Label * l)1843 void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); } vbroadcastss(Ymm dst,Xmm src)1844 void Assembler::vbroadcastss(Ymm dst, Xmm src) { this->op(0x66,0x380f,0x18, dst, (Ymm)src); } vbroadcastss(Ymm dst,GP64 ptr,int off)1845 void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) { 1846 int prefix = 0x66, 1847 map = 0x380f, 1848 opcode = 0x18; 1849 VEX v = vex(0, dst>>3, 0, ptr>>3, 1850 map, 0, /*ymm?*/1, prefix); 1851 this->bytes(v.bytes, v.len); 1852 this->byte(opcode); 1853 1854 this->byte(mod_rm(mod(off), dst&7, ptr&7)); 1855 this->bytes(&off, imm_bytes(mod(off))); 1856 } 1857 jump(uint8_t condition,Label * l)1858 void Assembler::jump(uint8_t condition, Label* l) { 1859 // These conditional jumps can be either 2 bytes (short) or 6 bytes (near): 1860 // 7? one-byte-disp 1861 // 0F 8? four-byte-disp 1862 // We always use the near displacement to make updating labels simpler (no resizing). 1863 this->byte(0x0f); 1864 this->byte(condition); 1865 this->word(this->disp32(l)); 1866 } je(Label * l)1867 void Assembler::je (Label* l) { this->jump(0x84, l); } jne(Label * l)1868 void Assembler::jne(Label* l) { this->jump(0x85, l); } jl(Label * l)1869 void Assembler::jl (Label* l) { this->jump(0x8c, l); } jc(Label * l)1870 void Assembler::jc (Label* l) { this->jump(0x82, l); } 1871 jmp(Label * l)1872 void Assembler::jmp(Label* l) { 1873 // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit. 1874 this->byte(0xe9); 1875 this->word(this->disp32(l)); 1876 } 1877 load_store(int prefix,int map,int opcode,Ymm ymm,GP64 ptr)1878 void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) { 1879 VEX v = vex(0, ymm>>3, 0, ptr>>3, 1880 map, 0, /*ymm?*/1, prefix); 1881 this->bytes(v.bytes, v.len); 1882 this->byte(opcode); 1883 this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7)); 1884 } 1885 vmovups(Ymm dst,GP64 src)1886 void Assembler::vmovups (Ymm dst, GP64 src) { this->load_store(0 , 0x0f,0x10, dst,src); } vpmovzxwd(Ymm dst,GP64 src)1887 void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); } vpmovzxbd(Ymm dst,GP64 src)1888 void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); } 1889 vmovups(GP64 dst,Ymm src)1890 void Assembler::vmovups (GP64 dst, Ymm src) { this->load_store(0 , 0x0f,0x11, src,dst); } vmovups(GP64 dst,Xmm src)1891 void Assembler::vmovups (GP64 dst, Xmm src) { 1892 // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0. 1893 int prefix = 0, 1894 map = 0x0f, 1895 opcode = 0x11; 1896 VEX v = vex(0, src>>3, 0, dst>>3, 1897 map, 0, /*ymm?*/0, prefix); 1898 this->bytes(v.bytes, v.len); 1899 this->byte(opcode); 1900 this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); 1901 } 1902 vmovq(GP64 dst,Xmm src)1903 void Assembler::vmovq(GP64 dst, Xmm src) { 1904 int prefix = 0x66, 1905 map = 0x0f, 1906 opcode = 0xd6; 1907 VEX v = vex(0, src>>3, 0, dst>>3, 1908 map, 0, /*ymm?*/0, prefix); 1909 this->bytes(v.bytes, v.len); 1910 this->byte(opcode); 1911 this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); 1912 } 1913 vmovd(GP64 dst,Xmm src)1914 void Assembler::vmovd(GP64 dst, Xmm src) { 1915 int prefix = 0x66, 1916 map = 0x0f, 1917 opcode = 0x7e; 1918 VEX v = vex(0, src>>3, 0, dst>>3, 1919 map, 0, /*ymm?*/0, prefix); 1920 this->bytes(v.bytes, v.len); 1921 this->byte(opcode); 1922 this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); 1923 } 1924 vmovd_direct(GP64 dst,Xmm src)1925 void Assembler::vmovd_direct(GP64 dst, Xmm src) { 1926 int prefix = 0x66, 1927 map = 0x0f, 1928 opcode = 0x7e; 1929 VEX v = vex(0, src>>3, 0, dst>>3, 1930 map, 0, /*ymm?*/0, prefix); 1931 this->bytes(v.bytes, v.len); 1932 this->byte(opcode); 1933 this->byte(mod_rm(Mod::Direct, src&7, dst&7)); 1934 } 1935 vmovd(Xmm dst,GP64 src)1936 void Assembler::vmovd(Xmm dst, GP64 src) { 1937 int prefix = 0x66, 1938 map = 0x0f, 1939 opcode = 0x6e; 1940 VEX v = vex(0, dst>>3, 0, src>>3, 1941 map, 0, /*ymm?*/0, prefix); 1942 this->bytes(v.bytes, v.len); 1943 this->byte(opcode); 1944 this->byte(mod_rm(Mod::Indirect, dst&7, src&7)); 1945 } 1946 vmovd(Xmm dst,Scale scale,GP64 index,GP64 base)1947 void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) { 1948 int prefix = 0x66, 1949 map = 0x0f, 1950 opcode = 0x6e; 1951 VEX v = vex(0, dst>>3, index>>3, base>>3, 1952 map, 0, /*ymm?*/0, prefix); 1953 this->bytes(v.bytes, v.len); 1954 this->byte(opcode); 1955 this->byte(mod_rm(Mod::Indirect, dst&7, rsp)); 1956 this->byte(sib(scale, index&7, base&7)); 1957 } 1958 vmovd_direct(Xmm dst,GP64 src)1959 void Assembler::vmovd_direct(Xmm dst, GP64 src) { 1960 int prefix = 0x66, 1961 map = 0x0f, 1962 opcode = 0x6e; 1963 VEX v = vex(0, dst>>3, 0, src>>3, 1964 map, 0, /*ymm?*/0, prefix); 1965 this->bytes(v.bytes, v.len); 1966 this->byte(opcode); 1967 this->byte(mod_rm(Mod::Direct, dst&7, src&7)); 1968 } 1969 movzbl(GP64 dst,GP64 src,int off)1970 void Assembler::movzbl(GP64 dst, GP64 src, int off) { 1971 if ((dst>>3) || (src>>3)) { 1972 this->byte(rex(0,dst>>3,0,src>>3)); 1973 } 1974 this->byte(0x0f); 1975 this->byte(0xb6); 1976 this->byte(mod_rm(mod(off), dst&7, src&7)); 1977 this->bytes(&off, imm_bytes(mod(off))); 1978 } 1979 1980 movb(GP64 dst,GP64 src)1981 void Assembler::movb(GP64 dst, GP64 src) { 1982 if ((dst>>3) || (src>>3)) { 1983 this->byte(rex(0,src>>3,0,dst>>3)); 1984 } 1985 this->byte(0x88); 1986 this->byte(mod_rm(Mod::Indirect, src&7, dst&7)); 1987 } 1988 vpinsrw(Xmm dst,Xmm src,GP64 ptr,int imm)1989 void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) { 1990 int prefix = 0x66, 1991 map = 0x0f, 1992 opcode = 0xc4; 1993 VEX v = vex(0, dst>>3, 0, ptr>>3, 1994 map, src, /*ymm?*/0, prefix); 1995 this->bytes(v.bytes, v.len); 1996 this->byte(opcode); 1997 this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7)); 1998 this->byte(imm); 1999 } 2000 vpinsrb(Xmm dst,Xmm src,GP64 ptr,int imm)2001 void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) { 2002 int prefix = 0x66, 2003 map = 0x3a0f, 2004 opcode = 0x20; 2005 VEX v = vex(0, dst>>3, 0, ptr>>3, 2006 map, src, /*ymm?*/0, prefix); 2007 this->bytes(v.bytes, v.len); 2008 this->byte(opcode); 2009 this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7)); 2010 this->byte(imm); 2011 } 2012 vpextrw(GP64 ptr,Xmm src,int imm)2013 void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) { 2014 int prefix = 0x66, 2015 map = 0x3a0f, 2016 opcode = 0x15; 2017 2018 VEX v = vex(0, src>>3, 0, ptr>>3, 2019 map, 0, /*ymm?*/0, prefix); 2020 this->bytes(v.bytes, v.len); 2021 this->byte(opcode); 2022 this->byte(mod_rm(Mod::Indirect, src&7, ptr&7)); 2023 this->byte(imm); 2024 } vpextrb(GP64 ptr,Xmm src,int imm)2025 void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) { 2026 int prefix = 0x66, 2027 map = 0x3a0f, 2028 opcode = 0x14; 2029 2030 VEX v = vex(0, src>>3, 0, ptr>>3, 2031 map, 0, /*ymm?*/0, prefix); 2032 this->bytes(v.bytes, v.len); 2033 this->byte(opcode); 2034 this->byte(mod_rm(Mod::Indirect, src&7, ptr&7)); 2035 this->byte(imm); 2036 } 2037 vgatherdps(Ymm dst,Scale scale,Ymm ix,GP64 base,Ymm mask)2038 void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) { 2039 // Unlike most instructions, no aliasing is permitted here. 2040 SkASSERT(dst != ix); 2041 SkASSERT(dst != mask); 2042 SkASSERT(mask != ix); 2043 2044 int prefix = 0x66, 2045 map = 0x380f, 2046 opcode = 0x92; 2047 VEX v = vex(0, dst>>3, ix>>3, base>>3, 2048 map, mask, /*ymm?*/1, prefix); 2049 this->bytes(v.bytes, v.len); 2050 this->byte(opcode); 2051 this->byte(mod_rm(Mod::Indirect, dst&7, rsp)); 2052 this->byte(sib(scale, ix&7, base&7)); 2053 } 2054 2055 // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf 2056 operator ""_mask(unsigned long long bits)2057 static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; } 2058 op(uint32_t hi,V m,uint32_t lo,V n,V d)2059 void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) { 2060 this->word( (hi & 11_mask) << 21 2061 | (m & 5_mask) << 16 2062 | (lo & 6_mask) << 10 2063 | (n & 5_mask) << 5 2064 | (d & 5_mask) << 0); 2065 } 2066 and16b(V d,V n,V m)2067 void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); } orr16b(V d,V n,V m)2068 void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); } eor16b(V d,V n,V m)2069 void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); } bic16b(V d,V n,V m)2070 void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); } bsl16b(V d,V n,V m)2071 void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); } not16b(V d,V n)2072 void Assembler::not16b(V d, V n) { this->op(0b0'1'1'01110'00'10000'00101'10, n, d); } 2073 add4s(V d,V n,V m)2074 void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); } sub4s(V d,V n,V m)2075 void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); } mul4s(V d,V n,V m)2076 void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); } 2077 cmeq4s(V d,V n,V m)2078 void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); } cmgt4s(V d,V n,V m)2079 void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); } 2080 sub8h(V d,V n,V m)2081 void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); } mul8h(V d,V n,V m)2082 void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); } 2083 fadd4s(V d,V n,V m)2084 void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); } fsub4s(V d,V n,V m)2085 void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); } fmul4s(V d,V n,V m)2086 void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); } fdiv4s(V d,V n,V m)2087 void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); } fmin4s(V d,V n,V m)2088 void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); } fmax4s(V d,V n,V m)2089 void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); } fneg4s(V d,V n)2090 void Assembler::fneg4s(V d, V n) { this->op(0b0'1'1'01110'1'0'10000'01111'10, n, d); } 2091 fcmeq4s(V d,V n,V m)2092 void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); } fcmgt4s(V d,V n,V m)2093 void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); } fcmge4s(V d,V n,V m)2094 void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); } 2095 fmla4s(V d,V n,V m)2096 void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); } fmls4s(V d,V n,V m)2097 void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); } 2098 tbl(V d,V n,V m)2099 void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); } 2100 op(uint32_t op22,int imm,V n,V d)2101 void Assembler::op(uint32_t op22, int imm, V n, V d) { 2102 this->word( (op22 & 22_mask) << 10 2103 | imm << 16 // imm is embedded inside op, bit size depends on op 2104 | (n & 5_mask) << 5 2105 | (d & 5_mask) << 0); 2106 } 2107 sli4s(V d,V n,int imm)2108 void Assembler::sli4s(V d, V n, int imm) { 2109 this->op(0b0'1'1'011110'0100'000'01010'1, ( imm&31), n, d); 2110 } shl4s(V d,V n,int imm)2111 void Assembler::shl4s(V d, V n, int imm) { 2112 this->op(0b0'1'0'011110'0100'000'01010'1, ( imm&31), n, d); 2113 } sshr4s(V d,V n,int imm)2114 void Assembler::sshr4s(V d, V n, int imm) { 2115 this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d); 2116 } ushr4s(V d,V n,int imm)2117 void Assembler::ushr4s(V d, V n, int imm) { 2118 this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d); 2119 } ushr8h(V d,V n,int imm)2120 void Assembler::ushr8h(V d, V n, int imm) { 2121 this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d); 2122 } 2123 scvtf4s(V d,V n)2124 void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); } fcvtzs4s(V d,V n)2125 void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); } fcvtns4s(V d,V n)2126 void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); } 2127 xtns2h(V d,V n)2128 void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); } xtnh2b(V d,V n)2129 void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); } 2130 uxtlb2h(V d,V n)2131 void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); } uxtlh2s(V d,V n)2132 void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); } 2133 uminv4s(V d,V n)2134 void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); } 2135 brk(int imm16)2136 void Assembler::brk(int imm16) { 2137 this->word(0b11010100'001'0000000000000000'000'00 2138 | (imm16 & 16_mask) << 5); 2139 } 2140 ret(X n)2141 void Assembler::ret(X n) { 2142 this->word(0b1101011'0'0'10'11111'0000'0'0 << 10 2143 | (n & 5_mask) << 5); 2144 } 2145 add(X d,X n,int imm12)2146 void Assembler::add(X d, X n, int imm12) { 2147 this->word(0b1'0'0'10001'00 << 22 2148 | (imm12 & 12_mask) << 10 2149 | (n & 5_mask) << 5 2150 | (d & 5_mask) << 0); 2151 } sub(X d,X n,int imm12)2152 void Assembler::sub(X d, X n, int imm12) { 2153 this->word( 0b1'1'0'10001'00 << 22 2154 | (imm12 & 12_mask) << 10 2155 | (n & 5_mask) << 5 2156 | (d & 5_mask) << 0); 2157 } subs(X d,X n,int imm12)2158 void Assembler::subs(X d, X n, int imm12) { 2159 this->word( 0b1'1'1'10001'00 << 22 2160 | (imm12 & 12_mask) << 10 2161 | (n & 5_mask) << 5 2162 | (d & 5_mask) << 0); 2163 } 2164 b(Condition cond,Label * l)2165 void Assembler::b(Condition cond, Label* l) { 2166 const int imm19 = this->disp19(l); 2167 this->word( 0b0101010'0 << 24 2168 | (imm19 & 19_mask) << 5 2169 | ((int)cond & 4_mask) << 0); 2170 } cbz(X t,Label * l)2171 void Assembler::cbz(X t, Label* l) { 2172 const int imm19 = this->disp19(l); 2173 this->word( 0b1'011010'0 << 24 2174 | (imm19 & 19_mask) << 5 2175 | (t & 5_mask) << 0); 2176 } cbnz(X t,Label * l)2177 void Assembler::cbnz(X t, Label* l) { 2178 const int imm19 = this->disp19(l); 2179 this->word( 0b1'011010'1 << 24 2180 | (imm19 & 19_mask) << 5 2181 | (t & 5_mask) << 0); 2182 } 2183 ldrq(V dst,X src)2184 void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); } ldrs(V dst,X src)2185 void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); } ldrb(V dst,X src)2186 void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); } 2187 strq(V src,X dst)2188 void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); } strs(V src,X dst)2189 void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); } strb(V src,X dst)2190 void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); } 2191 fmovs(X dst,V src)2192 void Assembler::fmovs(X dst, V src) { 2193 this->word(0b0'0'0'11110'00'1'00'110'000000 << 10 2194 | (src & 5_mask) << 5 2195 | (dst & 5_mask) << 0); 2196 } 2197 ldrq(V dst,Label * l)2198 void Assembler::ldrq(V dst, Label* l) { 2199 const int imm19 = this->disp19(l); 2200 this->word( 0b10'011'1'00 << 24 2201 | (imm19 & 19_mask) << 5 2202 | (dst & 5_mask) << 0); 2203 } 2204 label(Label * l)2205 void Assembler::label(Label* l) { 2206 if (fCode) { 2207 // The instructions all currently point to l->offset. 2208 // We'll want to add a delta to point them to here(). 2209 int delta = here().offset - l->offset; 2210 l->offset = here().offset; 2211 2212 if (l->kind == Label::ARMDisp19) { 2213 for (int ref : l->references) { 2214 // ref points to a 32-bit instruction with 19-bit displacement in instructions. 2215 uint32_t inst; 2216 memcpy(&inst, fCode + ref, 4); 2217 2218 // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ] 2219 int disp = (int)(inst << 8) >> 13; 2220 2221 disp += delta/4; // delta is in bytes, we want instructions. 2222 2223 // Put it all back together, preserving the high 8 bits and low 5. 2224 inst = ((disp << 5) & (19_mask << 5)) 2225 | ((inst ) & ~(19_mask << 5)); 2226 2227 memcpy(fCode + ref, &inst, 4); 2228 } 2229 } 2230 2231 if (l->kind == Label::X86Disp32) { 2232 for (int ref : l->references) { 2233 // ref points to a 32-bit displacement in bytes. 2234 int disp; 2235 memcpy(&disp, fCode + ref, 4); 2236 2237 disp += delta; 2238 2239 memcpy(fCode + ref, &disp, 4); 2240 } 2241 } 2242 } 2243 } 2244 eval(int n,void * args[]) const2245 void Program::eval(int n, void* args[]) const { 2246 #define SKVM_JIT_STATS 0 2247 #if SKVM_JIT_STATS 2248 static std::atomic<int64_t> calls{0}, jits{0}, 2249 pixels{0}, fast{0}; 2250 pixels += n; 2251 if (0 == calls++) { 2252 atexit([]{ 2253 int64_t num = jits .load(), 2254 den = calls.load(); 2255 SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den); 2256 num = fast .load(); 2257 den = pixels.load(); 2258 SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den); 2259 }); 2260 } 2261 #endif 2262 // This may fail either simply because we can't JIT, or when using LLVM, 2263 // because the work represented by fImpl->llvm_compiling hasn't finished yet. 2264 if (const void* b = fImpl->jit_entry.load()) { 2265 #if SKVM_JIT_STATS 2266 jits++; 2267 fast += n; 2268 #endif 2269 void** a = args; 2270 switch (fImpl->strides.size()) { 2271 case 0: return ((void(*)(int ))b)(n ); 2272 case 1: return ((void(*)(int,void* ))b)(n,a[0] ); 2273 case 2: return ((void(*)(int,void*,void* ))b)(n,a[0],a[1] ); 2274 case 3: return ((void(*)(int,void*,void*,void* ))b)(n,a[0],a[1],a[2] ); 2275 case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]); 2276 case 5: return ((void(*)(int,void*,void*,void*,void*,void*))b) 2277 (n,a[0],a[1],a[2],a[3],a[4]); 2278 default: SkUNREACHABLE; // TODO 2279 } 2280 } 2281 2282 // So we'll sometimes use the interpreter here even if later calls will use the JIT. 2283 SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(), 2284 this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(), 2285 n, args); 2286 } 2287 2288 #if defined(SKVM_LLVM) setupLLVM(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2289 void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions, 2290 const char* debug_name) { 2291 auto ctx = std::make_unique<llvm::LLVMContext>(); 2292 2293 auto mod = std::make_unique<llvm::Module>("", *ctx); 2294 // All the scary bare pointers from here on are owned by ctx or mod, I think. 2295 2296 // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines. 2297 const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4; 2298 2299 llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(), 2300 *i32 = llvm::Type::getInt32Ty(*ctx); 2301 2302 std::vector<llvm::Type*> arg_types = { i32 }; 2303 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2304 arg_types.push_back(ptr); 2305 } 2306 2307 llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx), 2308 arg_types, /*vararg?=*/false); 2309 llvm::Function* fn 2310 = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod); 2311 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2312 fn->addParamAttr(i+1, llvm::Attribute::NoAlias); 2313 } 2314 2315 llvm::BasicBlock *enter = llvm::BasicBlock::Create(*ctx, "enter" , fn), 2316 *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn), 2317 *testK = llvm::BasicBlock::Create(*ctx, "testK" , fn), 2318 *loopK = llvm::BasicBlock::Create(*ctx, "loopK" , fn), 2319 *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn), 2320 *test1 = llvm::BasicBlock::Create(*ctx, "test1" , fn), 2321 *loop1 = llvm::BasicBlock::Create(*ctx, "loop1" , fn), 2322 *leave = llvm::BasicBlock::Create(*ctx, "leave" , fn); 2323 2324 using IRBuilder = llvm::IRBuilder<>; 2325 2326 llvm::PHINode* n; 2327 std::vector<llvm::PHINode*> args; 2328 std::vector<llvm::Value*> vals(instructions.size()); 2329 2330 auto emit = [&](size_t i, bool scalar, IRBuilder* b) { 2331 auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = instructions[i]; 2332 2333 llvm::Type *i1 = llvm::Type::getInt1Ty (*ctx), 2334 *i8 = llvm::Type::getInt8Ty (*ctx), 2335 *i8x4 = llvm::VectorType::get(i8, 4), 2336 *i16 = llvm::Type::getInt16Ty(*ctx), 2337 *i16x2 = llvm::VectorType::get(i16, 2), 2338 *f32 = llvm::Type::getFloatTy(*ctx), 2339 *I1 = scalar ? i1 : llvm::VectorType::get(i1 , K ), 2340 *I8 = scalar ? i8 : llvm::VectorType::get(i8 , K ), 2341 *I8x4 = scalar ? i8x4 : llvm::VectorType::get(i8 , K*4), 2342 *I16 = scalar ? i16 : llvm::VectorType::get(i16, K ), 2343 *I16x2 = scalar ? i16x2 : llvm::VectorType::get(i16, K*2), 2344 *I32 = scalar ? i32 : llvm::VectorType::get(i32, K ), 2345 *F32 = scalar ? f32 : llvm::VectorType::get(f32, K ); 2346 2347 auto I = [&](llvm::Value* v) { return b->CreateBitCast(v, I32 ); }; 2348 auto F = [&](llvm::Value* v) { return b->CreateBitCast(v, F32 ); }; 2349 auto x2 = [&](llvm::Value* v) { return b->CreateBitCast(v, I16x2); }; 2350 2351 auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); }; 2352 2353 switch (llvm::Type* t = nullptr; op) { 2354 default: 2355 SkDebugf("can't llvm %s (%d)\n", name(op), op); 2356 return false; 2357 2358 case Op::assert_true: /*TODO*/ break; 2359 2360 case Op::index: 2361 if (I32->isVectorTy()) { 2362 std::vector<llvm::Constant*> iota(K); 2363 for (int j = 0; j < K; j++) { 2364 iota[j] = b->getInt32(j); 2365 } 2366 vals[i] = b->CreateSub(b->CreateVectorSplat(K, n), 2367 llvm::ConstantVector::get(iota)); 2368 } else { 2369 vals[i] = n; 2370 } break; 2371 2372 case Op::load8: t = I8 ; goto load; 2373 case Op::load16: t = I16; goto load; 2374 case Op::load32: t = I32; goto load; 2375 load: { 2376 llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo()); 2377 vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32); 2378 } break; 2379 2380 2381 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break; 2382 2383 case Op::uniform8: t = i8 ; goto uniform; 2384 case Op::uniform16: t = i16; goto uniform; 2385 case Op::uniform32: t = i32; goto uniform; 2386 uniform: { 2387 llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, 2388 args[immy], 2389 immz), 2390 t->getPointerTo()); 2391 llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32); 2392 vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val) 2393 : val; 2394 } break; 2395 2396 case Op::gather8: t = i8 ; goto gather; 2397 case Op::gather16: t = i16; goto gather; 2398 case Op::gather32: t = i32; goto gather; 2399 gather: { 2400 // Our gather base pointer is immz bytes off of uniform immy. 2401 llvm::Value* base = 2402 b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr, 2403 args[immy], 2404 immz), 2405 t->getPointerTo()->getPointerTo())); 2406 2407 llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]); 2408 llvm::Value* gathered; 2409 if (ptr->getType()->isVectorTy()) { 2410 gathered = b->CreateMaskedGather(ptr, 1); 2411 } else { 2412 gathered = b->CreateAlignedLoad(ptr, 1); 2413 } 2414 vals[i] = b->CreateZExt(gathered, I32); 2415 } break; 2416 2417 case Op::store8: t = I8 ; goto store; 2418 case Op::store16: t = I16; goto store; 2419 case Op::store32: t = I32; goto store; 2420 store: { 2421 llvm::Value* val = b->CreateTrunc(vals[x], t); 2422 llvm::Value* ptr = b->CreateBitCast(args[immy], 2423 val->getType()->getPointerTo()); 2424 vals[i] = b->CreateAlignedStore(val, ptr, 1); 2425 } break; 2426 2427 case Op::bit_and: vals[i] = b->CreateAnd(vals[x], vals[y]); break; 2428 case Op::bit_or : vals[i] = b->CreateOr (vals[x], vals[y]); break; 2429 case Op::bit_xor: vals[i] = b->CreateXor(vals[x], vals[y]); break; 2430 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break; 2431 2432 case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break; 2433 2434 case Op::select: 2435 vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]); 2436 break; 2437 2438 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break; 2439 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break; 2440 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break; 2441 2442 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break; 2443 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break; 2444 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break; 2445 2446 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break; 2447 case Op::neq_i32: vals[i] = S(I32, b->CreateICmpNE (vals[x], vals[y])); break; 2448 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break; 2449 case Op::gte_i32: vals[i] = S(I32, b->CreateICmpSGE(vals[x], vals[y])); break; 2450 2451 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break; 2452 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break; 2453 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break; 2454 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break; 2455 2456 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break; 2457 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break; 2458 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break; 2459 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break; 2460 2461 case Op::fma_f32: 2462 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2463 {F(vals[x]), F(vals[y]), F(vals[z])})); 2464 break; 2465 2466 case Op::fms_f32: 2467 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2468 {F(vals[x]), F(vals[y]), 2469 b->CreateFNeg(F(vals[z]))})); 2470 break; 2471 2472 case Op::fnma_f32: 2473 vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32}, 2474 {b->CreateFNeg(F(vals[x])), F(vals[y]), 2475 F(vals[z])})); 2476 break; 2477 2478 case Op::floor: 2479 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x]))); 2480 break; 2481 2482 case Op::max_f32: 2483 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])), 2484 F(vals[y]), F(vals[x]))); 2485 break; 2486 case Op::min_f32: 2487 vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])), 2488 F(vals[y]), F(vals[x]))); 2489 break; 2490 2491 case Op::sqrt_f32: 2492 vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x]))); 2493 break; 2494 2495 case Op::to_f32: vals[i] = I(b->CreateSIToFP( vals[x] , F32)); break; 2496 case Op::trunc : vals[i] = b->CreateFPToSI(F(vals[x]), I32) ; break; 2497 case Op::round : { 2498 // Basic impl when we can't use cvtps2dq and co. 2499 auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x])); 2500 vals[i] = b->CreateFPToSI(round, I32); 2501 2502 #if 1 && defined(SK_CPU_X86) 2503 // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling. 2504 if (scalar) { 2505 // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3. ¯\_(ツ)_/¯ 2506 llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4)); 2507 v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0); 2508 vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v}); 2509 } else { 2510 SkASSERT(K == 4 || K == 8); 2511 auto intr = K == 4 ? llvm::Intrinsic::x86_sse2_cvtps2dq : 2512 /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256; 2513 vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])}); 2514 } 2515 #endif 2516 } break; 2517 2518 case Op::add_i16x2: vals[i] = I(b->CreateAdd(x2(vals[x]), x2(vals[y]))); break; 2519 case Op::sub_i16x2: vals[i] = I(b->CreateSub(x2(vals[x]), x2(vals[y]))); break; 2520 case Op::mul_i16x2: vals[i] = I(b->CreateMul(x2(vals[x]), x2(vals[y]))); break; 2521 2522 case Op::shl_i16x2: vals[i] = I(b->CreateShl (x2(vals[x]), immy)); break; 2523 case Op::sra_i16x2: vals[i] = I(b->CreateAShr(x2(vals[x]), immy)); break; 2524 case Op::shr_i16x2: vals[i] = I(b->CreateLShr(x2(vals[x]), immy)); break; 2525 2526 case Op:: eq_i16x2: 2527 vals[i] = I(S(I16x2, b->CreateICmpEQ (x2(vals[x]), x2(vals[y])))); 2528 break; 2529 case Op::neq_i16x2: 2530 vals[i] = I(S(I16x2, b->CreateICmpNE (x2(vals[x]), x2(vals[y])))); 2531 break; 2532 case Op:: gt_i16x2: 2533 vals[i] = I(S(I16x2, b->CreateICmpSGT(x2(vals[x]), x2(vals[y])))); 2534 break; 2535 case Op::gte_i16x2: 2536 vals[i] = I(S(I16x2, b->CreateICmpSGE(x2(vals[x]), x2(vals[y])))); 2537 break; 2538 2539 case Op::bytes: { 2540 int N = vals[x]->getType()->isVectorTy() ? K : 1; 2541 2542 uint32_t off = 0; 2543 auto nibble_to_mask = [&](uint8_t n) -> uint32_t { 2544 switch (n) { 2545 case 0: return 4*N; // Select any byte in the second (zero) arg. 2546 case 1: return off + 0; // 1st byte in this arg. 2547 case 2: return off + 1; // 2nd ... 2548 case 3: return off + 2; // 3rd ... 2549 case 4: return off + 3; // 4th byte in this arg. 2550 } 2551 SkUNREACHABLE; 2552 return 0; 2553 }; 2554 2555 std::vector<uint32_t> mask(N*4); 2556 for (int i = 0; i < N; i++) { 2557 mask[4*i+0] = nibble_to_mask( (immy >> 0) & 0xf ); 2558 mask[4*i+1] = nibble_to_mask( (immy >> 4) & 0xf ); 2559 mask[4*i+2] = nibble_to_mask( (immy >> 8) & 0xf ); 2560 mask[4*i+3] = nibble_to_mask( (immy >> 12) & 0xf ); 2561 off += 4; 2562 } 2563 2564 llvm::Value* input = b->CreateBitCast(vals[x], I8x4); 2565 llvm::Value* zero = llvm::Constant::getNullValue(I8x4); 2566 vals[i] = I(b->CreateShuffleVector(input, zero, mask)); 2567 } break; 2568 } 2569 return true; 2570 }; 2571 2572 { 2573 IRBuilder b(enter); 2574 b.CreateBr(hoistK); 2575 } 2576 2577 // hoistK: emit each hoistable vector instruction; goto testK; 2578 // LLVM can do this sort of thing itself, but we've got the information cheap, 2579 // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe. 2580 { 2581 IRBuilder b(hoistK); 2582 2583 // Hoisted instructions will need args (think, uniforms), so set that up now. 2584 // These phi nodes are degenerate... they'll always be the passed-in args from enter. 2585 // Later on when we start looping the phi nodes will start looking useful. 2586 llvm::Argument* arg = fn->arg_begin(); 2587 (void)arg++; // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction. 2588 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2589 args.push_back(b.CreatePHI(arg->getType(), 1)); 2590 args.back()->addIncoming(arg++, enter); 2591 } 2592 2593 for (size_t i = 0; i < instructions.size(); i++) { 2594 if (instructions[i].can_hoist && !emit(i, false, &b)) { 2595 return; 2596 } 2597 } 2598 2599 b.CreateBr(testK); 2600 } 2601 2602 // testK: if (N >= K) goto loopK; else goto hoist1; 2603 { 2604 IRBuilder b(testK); 2605 2606 // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK. 2607 // These also start as the initial function arguments; hoistK can't have changed them. 2608 llvm::Argument* arg = fn->arg_begin(); 2609 2610 n = b.CreatePHI(arg->getType(), 2); 2611 n->addIncoming(arg++, hoistK); 2612 2613 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2614 args[i] = b.CreatePHI(arg->getType(), 2); 2615 args[i]->addIncoming(arg++, hoistK); 2616 } 2617 2618 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1); 2619 } 2620 2621 // loopK: ... insts on K x T vectors; N -= K, args += K*stride; goto testK; 2622 { 2623 IRBuilder b(loopK); 2624 for (size_t i = 0; i < instructions.size(); i++) { 2625 if (!instructions[i].can_hoist && !emit(i, false, &b)) { 2626 return; 2627 } 2628 } 2629 2630 // n -= K 2631 llvm::Value* n_next = b.CreateSub(n, b.getInt32(K)); 2632 n->addIncoming(n_next, loopK); 2633 2634 // Each arg ptr += K 2635 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2636 llvm::Value* arg_next 2637 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]); 2638 args[i]->addIncoming(arg_next, loopK); 2639 } 2640 b.CreateBr(testK); 2641 } 2642 2643 // hoist1: emit each hoistable scalar instruction; goto test1; 2644 { 2645 IRBuilder b(hoist1); 2646 for (size_t i = 0; i < instructions.size(); i++) { 2647 if (instructions[i].can_hoist && !emit(i, true, &b)) { 2648 return; 2649 } 2650 } 2651 b.CreateBr(test1); 2652 } 2653 2654 // test1: if (N >= 1) goto loop1; else goto leave; 2655 { 2656 IRBuilder b(test1); 2657 2658 // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1. 2659 llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2); 2660 n_new->addIncoming(n, hoist1); 2661 n = n_new; 2662 2663 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2664 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2); 2665 arg_new->addIncoming(args[i], hoist1); 2666 args[i] = arg_new; 2667 } 2668 2669 b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave); 2670 } 2671 2672 // loop1: ... insts on scalars; N -= 1, args += stride; goto test1; 2673 { 2674 IRBuilder b(loop1); 2675 for (size_t i = 0; i < instructions.size(); i++) { 2676 if (!instructions[i].can_hoist && !emit(i, true, &b)) { 2677 return; 2678 } 2679 } 2680 2681 // n -= 1 2682 llvm::Value* n_next = b.CreateSub(n, b.getInt32(1)); 2683 n->addIncoming(n_next, loop1); 2684 2685 // Each arg ptr += K 2686 for (size_t i = 0; i < fImpl->strides.size(); i++) { 2687 llvm::Value* arg_next 2688 = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]); 2689 args[i]->addIncoming(arg_next, loop1); 2690 } 2691 b.CreateBr(test1); 2692 } 2693 2694 // leave: ret 2695 { 2696 IRBuilder b(leave); 2697 b.CreateRetVoid(); 2698 } 2699 2700 SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs())); 2701 2702 if (true) { 2703 SkString path = SkStringPrintf("/tmp/%s.bc", debug_name); 2704 std::error_code err; 2705 llvm::raw_fd_ostream os(path.c_str(), err); 2706 if (err) { 2707 return; 2708 } 2709 llvm::WriteBitcodeToFile(*mod, os); 2710 } 2711 2712 static SkOnce once; 2713 once([]{ 2714 SkAssertResult(false == llvm::InitializeNativeTarget()); 2715 SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter()); 2716 }); 2717 2718 if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod)) 2719 .setEngineKind(llvm::EngineKind::JIT) 2720 .setMCPU(llvm::sys::getHostCPUName()) 2721 .create()) { 2722 fImpl->llvm_ctx = std::move(ctx); 2723 fImpl->llvm_ee.reset(ee); 2724 2725 // We have to be careful here about what we close over and how, in case fImpl moves. 2726 // fImpl itself may change, but its pointee fields won't, so close over them by value. 2727 // Also, debug_name will almost certainly leave scope, so copy it. 2728 fImpl->llvm_compiling = std::async(std::launch::async, [dst = &fImpl->jit_entry, 2729 ee = fImpl->llvm_ee.get(), 2730 name = std::string(debug_name)]{ 2731 // std::atomic<void*>* dst; 2732 // llvm::ExecutionEngine* ee; 2733 // std::string name; 2734 dst->store( (void*)ee->getFunctionAddress(name.c_str()) ); 2735 }); 2736 } 2737 } 2738 #endif 2739 waitForLLVM() const2740 void Program::waitForLLVM() const { 2741 #if defined(SKVM_LLVM) 2742 if (fImpl->llvm_compiling.valid()) { 2743 fImpl->llvm_compiling.wait(); 2744 } 2745 #endif 2746 } 2747 hasJIT() const2748 bool Program::hasJIT() const { 2749 // Program::hasJIT() is really just a debugging / test aid, 2750 // so we don't mind adding a sync point here to wait for compilation. 2751 this->waitForLLVM(); 2752 2753 return fImpl->jit_entry.load() != nullptr; 2754 } 2755 dropJIT()2756 void Program::dropJIT() { 2757 #if defined(SKVM_LLVM) 2758 this->waitForLLVM(); 2759 fImpl->llvm_ee .reset(nullptr); 2760 fImpl->llvm_ctx.reset(nullptr); 2761 #elif defined(SKVM_JIT) 2762 if (fImpl->dylib) { 2763 dlclose(fImpl->dylib); 2764 } else if (auto jit_entry = fImpl->jit_entry.load()) { 2765 munmap(jit_entry, fImpl->jit_size); 2766 } 2767 #else 2768 SkASSERT(!this->hasJIT()); 2769 #endif 2770 2771 fImpl->jit_entry.store(nullptr); 2772 fImpl->jit_size = 0; 2773 fImpl->dylib = nullptr; 2774 } 2775 Program()2776 Program::Program() : fImpl(std::make_unique<Impl>()) {} 2777 ~Program()2778 Program::~Program() { 2779 // Moved-from Programs may have fImpl == nullptr. 2780 if (fImpl) { 2781 this->dropJIT(); 2782 } 2783 } 2784 Program(Program && other)2785 Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {} 2786 operator =(Program && other)2787 Program& Program::operator=(Program&& other) { 2788 fImpl = std::move(other.fImpl); 2789 return *this; 2790 } 2791 Program(const std::vector<OptimizedInstruction> & interpreter,const std::vector<int> & strides)2792 Program::Program(const std::vector<OptimizedInstruction>& interpreter, 2793 const std::vector<int>& strides) : Program() { 2794 fImpl->strides = strides; 2795 this->setupInterpreter(interpreter); 2796 } 2797 Program(const std::vector<OptimizedInstruction> & interpreter,const std::vector<OptimizedInstruction> & jit,const std::vector<int> & strides,const char * debug_name)2798 Program::Program(const std::vector<OptimizedInstruction>& interpreter, 2799 const std::vector<OptimizedInstruction>& jit, 2800 const std::vector<int>& strides, 2801 const char* debug_name) : Program() { 2802 fImpl->strides = strides; 2803 #if 1 && defined(SKVM_LLVM) 2804 this->setupLLVM(interpreter, debug_name); 2805 #elif 1 && defined(SKVM_JIT) 2806 this->setupJIT(jit, debug_name); 2807 #endif 2808 2809 // Might as well do this after setupLLVM() to get a little more time to compile. 2810 this->setupInterpreter(interpreter); 2811 } 2812 instructions() const2813 std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; } nargs() const2814 int Program::nargs() const { return (int)fImpl->strides.size(); } nregs() const2815 int Program::nregs() const { return fImpl->regs; } loop() const2816 int Program::loop () const { return fImpl->loop; } empty() const2817 bool Program::empty() const { return fImpl->instructions.empty(); } 2818 2819 // Translate OptimizedInstructions to InterpreterInstructions. setupInterpreter(const std::vector<OptimizedInstruction> & instructions)2820 void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) { 2821 // Register each instruction is assigned to. 2822 std::vector<Reg> reg(instructions.size()); 2823 2824 // This next bit is a bit more complicated than strictly necessary; 2825 // we could just assign every instruction to its own register. 2826 // 2827 // But recycling registers is fairly cheap, and good practice for the 2828 // JITs where minimizing register pressure really is important. 2829 // 2830 // Since we have effectively infinite registers, we hoist any value we can. 2831 // (The JIT may choose a more complex policy to reduce register pressure.) 2832 auto hoisted = [&](Val id) { return instructions[id].can_hoist; }; 2833 2834 fImpl->regs = 0; 2835 std::vector<Reg> avail; 2836 2837 // Assign this value to a register, recycling them where we can. 2838 auto assign_register = [&](Val id) { 2839 const OptimizedInstruction& inst = instructions[id]; 2840 2841 // If this is a real input and it's lifetime ends at this instruction, 2842 // we can recycle the register it's occupying. 2843 auto maybe_recycle_register = [&](Val input) { 2844 if (input != NA 2845 && instructions[input].death == id 2846 && !(hoisted(input) && instructions[input].used_in_loop)) { 2847 avail.push_back(reg[input]); 2848 } 2849 }; 2850 2851 // Take care to not recycle the same register twice. 2852 if (true ) { maybe_recycle_register(inst.x); } 2853 if (inst.y != inst.x ) { maybe_recycle_register(inst.y); } 2854 if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); } 2855 2856 // Instructions that die at themselves (stores) don't need a register. 2857 if (inst.death != id) { 2858 // Allocate a register if we have to, preferring to reuse anything available. 2859 if (avail.empty()) { 2860 reg[id] = fImpl->regs++; 2861 } else { 2862 reg[id] = avail.back(); 2863 avail.pop_back(); 2864 } 2865 } 2866 }; 2867 2868 // Assign a register to each hoisted instruction, then each non-hoisted loop instruction. 2869 for (Val id = 0; id < (Val)instructions.size(); id++) { 2870 if ( hoisted(id)) { assign_register(id); } 2871 } 2872 for (Val id = 0; id < (Val)instructions.size(); id++) { 2873 if (!hoisted(id)) { assign_register(id); } 2874 } 2875 2876 // Translate OptimizedInstructions to InterpreterIstructions by mapping values to 2877 // registers. This will be two passes, first hoisted instructions, then inside the loop. 2878 2879 // The loop begins at the fImpl->loop'th Instruction. 2880 fImpl->loop = 0; 2881 fImpl->instructions.reserve(instructions.size()); 2882 2883 // Add a dummy mapping for the N/A sentinel Val to any arbitrary register 2884 // so lookups don't have to know which arguments are used by which Ops. 2885 auto lookup_register = [&](Val id) { 2886 return id == NA ? (Reg)0 2887 : reg[id]; 2888 }; 2889 2890 auto push_instruction = [&](Val id, const OptimizedInstruction& inst) { 2891 InterpreterInstruction pinst{ 2892 inst.op, 2893 lookup_register(id), 2894 lookup_register(inst.x), 2895 {lookup_register(inst.y)}, 2896 {lookup_register(inst.z)}, 2897 }; 2898 if (inst.y == NA) { pinst.immy = inst.immy; } 2899 if (inst.z == NA) { pinst.immz = inst.immz; } 2900 fImpl->instructions.push_back(pinst); 2901 }; 2902 2903 for (Val id = 0; id < (Val)instructions.size(); id++) { 2904 const OptimizedInstruction& inst = instructions[id]; 2905 if (hoisted(id)) { 2906 push_instruction(id, inst); 2907 fImpl->loop++; 2908 } 2909 } 2910 for (Val id = 0; id < (Val)instructions.size(); id++) { 2911 const OptimizedInstruction& inst = instructions[id]; 2912 if (!hoisted(id)) { 2913 push_instruction(id, inst); 2914 } 2915 } 2916 } 2917 2918 #if defined(SKVM_JIT) 2919 2920 // Just so happens that we can translate the immediate control for our bytes() op 2921 // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl! bytes_control(int imm,int mask[4])2922 static void bytes_control(int imm, int mask[4]) { 2923 auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t { 2924 // 0 -> 0xff, Fill with zero 2925 // 1 -> 0x00, Select byte 0 2926 // 2 -> 0x01, " 1 2927 // 3 -> 0x02, " 2 2928 // 4 -> 0x03, " 3 2929 return n - 1; 2930 }; 2931 uint8_t control[] = { 2932 nibble_to_vpshufb( (imm >> 0) & 0xf ), 2933 nibble_to_vpshufb( (imm >> 4) & 0xf ), 2934 nibble_to_vpshufb( (imm >> 8) & 0xf ), 2935 nibble_to_vpshufb( (imm >> 12) & 0xf ), 2936 }; 2937 for (int i = 0; i < 4; i++) { 2938 mask[i] = (int)control[0] << 0 2939 | (int)control[1] << 8 2940 | (int)control[2] << 16 2941 | (int)control[3] << 24; 2942 2943 // Update each byte that refers to a byte index by 4 to 2944 // point into the next 32-bit lane, but leave any 0xff 2945 // that fills with zero alone. 2946 control[0] += control[0] == 0xff ? 0 : 4; 2947 control[1] += control[1] == 0xff ? 0 : 4; 2948 control[2] += control[2] == 0xff ? 0 : 4; 2949 control[3] += control[3] == 0xff ? 0 : 4; 2950 } 2951 } 2952 jit(const std::vector<OptimizedInstruction> & instructions,const bool try_hoisting,Assembler * a) const2953 bool Program::jit(const std::vector<OptimizedInstruction>& instructions, 2954 const bool try_hoisting, 2955 Assembler* a) const { 2956 using A = Assembler; 2957 2958 auto debug_dump = [&] { 2959 #if 0 2960 SkDebugfStream stream; 2961 this->dump(&stream); 2962 return true; 2963 #else 2964 return false; 2965 #endif 2966 }; 2967 2968 #if defined(__x86_64__) 2969 if (!SkCpu::Supports(SkCpu::HSW)) { 2970 return false; 2971 } 2972 A::GP64 N = A::rdi, 2973 scratch = A::rax, 2974 scratch2 = A::r11, 2975 arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 }; 2976 2977 // All 16 ymm registers are available to use. 2978 using Reg = A::Ymm; 2979 uint32_t avail = 0xffff; 2980 2981 #elif defined(__aarch64__) 2982 A::X N = A::x0, 2983 scratch = A::x8, 2984 arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 }; 2985 2986 // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15. 2987 using Reg = A::V; 2988 uint32_t avail = 0xffff00ff; 2989 #endif 2990 2991 if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) { 2992 return false; 2993 } 2994 2995 auto hoisted = [&](Val id) { return try_hoisting && instructions[id].can_hoist; }; 2996 2997 std::vector<Reg> r(instructions.size()); 2998 2999 struct LabelAndReg { 3000 A::Label label; 3001 Reg reg; 3002 }; 3003 SkTHashMap<int, LabelAndReg> constants, // All constants share the same pool. 3004 bytes_masks; // These vary per-lane. 3005 LabelAndReg iota; // Exists _only_ to vary per-lane. 3006 3007 auto warmup = [&](Val id) { 3008 const OptimizedInstruction& inst = instructions[id]; 3009 3010 switch (inst.op) { 3011 default: break; 3012 3013 case Op::bytes: if (!bytes_masks.find(inst.immy)) { 3014 bytes_masks.set(inst.immy, {}); 3015 if (try_hoisting) { 3016 // vpshufb can always work with the mask from memory, 3017 // but it helps to hoist the mask to a register for tbl. 3018 #if defined(__aarch64__) 3019 LabelAndReg* entry = bytes_masks.find(inst.immy); 3020 if (int found = __builtin_ffs(avail)) { 3021 entry->reg = (Reg)(found-1); 3022 avail ^= 1 << entry->reg; 3023 a->ldrq(entry->reg, &entry->label); 3024 } else { 3025 return false; 3026 } 3027 #endif 3028 } 3029 } 3030 break; 3031 } 3032 return true; 3033 }; 3034 3035 auto emit = [&](Val id, bool scalar) { 3036 const OptimizedInstruction& inst = instructions[id]; 3037 3038 Op op = inst.op; 3039 Val x = inst.x, 3040 y = inst.y, 3041 z = inst.z; 3042 int immy = inst.immy, 3043 immz = inst.immz; 3044 3045 // Most (but not all) ops create an output value and need a register to hold it, dst. 3046 // We track each instruction's dst in r[] so we can thread it through as an input 3047 // to any future instructions needing that value. 3048 // 3049 // And some ops may need a temporary register, tmp. Some need both tmp and dst. 3050 // 3051 // tmp and dst are very similar and can and will often be assigned the same register, 3052 // but tmp may never alias any of the instructions's inputs, while dst may when this 3053 // instruction consumes that input, i.e. if the input reaches its end of life here. 3054 // 3055 // We'll assign both registers lazily to keep register pressure as low as possible. 3056 bool tmp_is_set = false, 3057 dst_is_set = false; 3058 Reg tmp_reg = (Reg)0; // This initial value won't matter... anything legal is fine. 3059 3060 bool ok = true; // Set to false if we need to assign a register and none's available. 3061 3062 // First lock in how to choose tmp if we need to based on the registers 3063 // available before this instruction, not including any of its input registers. 3064 auto tmp = [&,avail/*important, closing over avail's current value*/]{ 3065 if (!tmp_is_set) { 3066 tmp_is_set = true; 3067 if (int found = __builtin_ffs(avail)) { 3068 // This is a temporary register just for this op, 3069 // so we leave it marked available for future ops. 3070 tmp_reg = (Reg)(found - 1); 3071 } else { 3072 // We needed a tmp register but couldn't find one available. :'( 3073 // This will cause emit() to return false, in turn causing jit() to fail. 3074 if (debug_dump()) { 3075 SkDebugf("\nCould not find a register to hold tmp\n"); 3076 } 3077 ok = false; 3078 } 3079 } 3080 return tmp_reg; 3081 }; 3082 3083 // Now make available any registers that are consumed by this instruction. 3084 // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.) 3085 auto maybe_recycle_register = [&](Val input) { 3086 if (input != NA 3087 && instructions[input].death == id 3088 && !(hoisted(input) && instructions[input].used_in_loop)) { 3089 avail |= 1 << r[input]; 3090 } 3091 }; 3092 maybe_recycle_register(x); 3093 maybe_recycle_register(y); 3094 maybe_recycle_register(z); 3095 // set_dst() and dst() will work read/write with this perhaps-just-updated avail. 3096 3097 // Some ops may decide dst on their own to best fit the instruction (see Op::fma_f32). 3098 auto set_dst = [&](Reg reg){ 3099 SkASSERT(dst_is_set == false); 3100 dst_is_set = true; 3101 3102 SkASSERT(avail & (1<<reg)); 3103 avail ^= 1<<reg; 3104 3105 r[id] = reg; 3106 }; 3107 3108 // Thanks to AVX and NEON's 3-argument instruction sets, 3109 // most ops can use any register as dst. 3110 auto dst = [&]{ 3111 if (!dst_is_set) { 3112 if (int found = __builtin_ffs(avail)) { 3113 set_dst((Reg)(found-1)); 3114 } else { 3115 // Same deal as with tmp... all the registers are occupied. Time to fail! 3116 if (debug_dump()) { 3117 SkDebugf("\nCould not find a register to hold value %d\n", id); 3118 } 3119 ok = false; 3120 } 3121 } 3122 return r[id]; 3123 }; 3124 3125 // Because we use the same logic to pick an arbitrary dst and to pick tmp, 3126 // and we know that tmp will never overlap any of the inputs, `dst() == tmp()` 3127 // is a simple idiom to check that the destination does not overlap any of the inputs. 3128 // Sometimes we can use this knowledge to do better instruction selection. 3129 3130 // Ok! Keep in mind that we haven't assigned tmp or dst yet, 3131 // just laid out hooks for how to do so if we need them, depending on the instruction. 3132 // 3133 // Now let's actually assemble the instruction! 3134 switch (op) { 3135 default: 3136 if (debug_dump()) { 3137 SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op); 3138 } 3139 return false; // TODO: many new ops 3140 3141 #if defined(__x86_64__) 3142 case Op::assert_true: { 3143 a->vptest (r[x], &constants[0xffffffff].label); 3144 A::Label all_true; 3145 a->jc(&all_true); 3146 a->int3(); 3147 a->label(&all_true); 3148 } break; 3149 3150 case Op::store8: if (scalar) { a->vpextrb (arg[immy], (A::Xmm)r[x], 0); } 3151 else { a->vpackusdw(tmp(), r[x], r[x]); 3152 a->vpermq (tmp(), tmp(), 0xd8); 3153 a->vpackuswb(tmp(), tmp(), tmp()); 3154 a->vmovq (arg[immy], (A::Xmm)tmp()); } 3155 break; 3156 3157 case Op::store16: if (scalar) { a->vpextrw (arg[immy], (A::Xmm)r[x], 0); } 3158 else { a->vpackusdw(tmp(), r[x], r[x]); 3159 a->vpermq (tmp(), tmp(), 0xd8); 3160 a->vmovups (arg[immy], (A::Xmm)tmp()); } 3161 break; 3162 3163 case Op::store32: if (scalar) { a->vmovd (arg[immy], (A::Xmm)r[x]); } 3164 else { a->vmovups(arg[immy], r[x]); } 3165 break; 3166 3167 case Op::load8: if (scalar) { 3168 a->vpxor (dst(), dst(), dst()); 3169 a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0); 3170 } else { 3171 a->vpmovzxbd(dst(), arg[immy]); 3172 } break; 3173 3174 case Op::load16: if (scalar) { 3175 a->vpxor (dst(), dst(), dst()); 3176 a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0); 3177 } else { 3178 a->vpmovzxwd(dst(), arg[immy]); 3179 } break; 3180 3181 case Op::load32: if (scalar) { a->vmovd ((A::Xmm)dst(), arg[immy]); } 3182 else { a->vmovups( dst(), arg[immy]); } 3183 break; 3184 3185 case Op::gather32: 3186 if (scalar) { 3187 auto base = scratch, 3188 index = scratch2; 3189 // Our gather base pointer is immz bytes off of uniform immy. 3190 a->movq(base, arg[immy], immz); 3191 3192 // Grab our index from lane 0 of the index argument. 3193 a->vmovd_direct(index, (A::Xmm)r[x]); 3194 3195 // dst = *(base + 4*index) 3196 a->vmovd((A::Xmm)dst(), A::FOUR, index, base); 3197 } else { 3198 // We may not let any of dst(), index, or mask use the same register, 3199 // so we must allocate registers manually and very carefully. 3200 3201 // index is argument x and has already been maybe_recycle_register()'d, 3202 // so we explicitly ignore its availability during this op. 3203 A::Ymm index = r[x]; 3204 uint32_t avail_during_gather = avail & ~(1<<index); 3205 3206 // Choose dst() to not overlap with index. 3207 if (int found = __builtin_ffs(avail_during_gather)) { 3208 set_dst((A::Ymm)(found-1)); 3209 avail_during_gather ^= (1<<dst()); 3210 } else { 3211 ok = false; 3212 break; 3213 } 3214 3215 // Choose (temporary) mask to not overlap with dst() or index. 3216 A::Ymm mask; 3217 if (int found = __builtin_ffs(avail_during_gather)) { 3218 mask = (A::Ymm)(found-1); 3219 } else { 3220 ok = false; 3221 break; 3222 } 3223 3224 // Our gather base pointer is immz bytes off of uniform immy. 3225 auto base = scratch; 3226 a->movq(base, arg[immy], immz); 3227 a->vpcmpeqd(mask, mask, mask); // (All lanes enabled.) 3228 a->vgatherdps(dst(), A::FOUR, index, base, mask); 3229 } 3230 break; 3231 3232 case Op::uniform8: a->movzbl(scratch, arg[immy], immz); 3233 a->vmovd_direct((A::Xmm)dst(), scratch); 3234 a->vbroadcastss(dst(), (A::Xmm)dst()); 3235 break; 3236 3237 case Op::uniform32: a->vbroadcastss(dst(), arg[immy], immz); 3238 break; 3239 3240 case Op::index: a->vmovd_direct((A::Xmm)tmp(), N); 3241 a->vbroadcastss(tmp(), (A::Xmm)tmp()); 3242 a->vpsubd(dst(), tmp(), &iota.label); 3243 break; 3244 3245 case Op::splat: if (immy) { a->vbroadcastss(dst(), &constants[immy].label); } 3246 else { a->vpxor(dst(), dst(), dst()); } 3247 break; 3248 3249 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break; 3250 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break; 3251 case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break; 3252 case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break; 3253 case Op::min_f32: a->vminps(dst(), r[x], r[y]); break; 3254 case Op::max_f32: a->vmaxps(dst(), r[x], r[y]); break; 3255 3256 case Op::fma_f32: 3257 if (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); } 3258 else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); } 3259 else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); } 3260 else { SkASSERT(dst() == tmp()); 3261 a->vmovdqa (dst(),r[x]); 3262 a->vfmadd132ps(dst(),r[z], r[y]); } 3263 break; 3264 3265 case Op::fms_f32: 3266 if (avail & (1<<r[x])) { set_dst(r[x]); a->vfmsub132ps(r[x], r[z], r[y]); } 3267 else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmsub213ps(r[y], r[x], r[z]); } 3268 else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmsub231ps(r[z], r[x], r[y]); } 3269 else { SkASSERT(dst() == tmp()); 3270 a->vmovdqa (dst(),r[x]); 3271 a->vfmsub132ps(dst(),r[z], r[y]); } 3272 break; 3273 3274 case Op::fnma_f32: 3275 if (avail & (1<<r[x])) { set_dst(r[x]); a->vfnmadd132ps(r[x],r[z], r[y]); } 3276 else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfnmadd213ps(r[y],r[x], r[z]); } 3277 else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfnmadd231ps(r[z],r[x], r[y]); } 3278 else { SkASSERT(dst() == tmp()); 3279 a->vmovdqa (dst(),r[x]); 3280 a->vfnmadd132ps(dst(),r[z],r[y]); } 3281 break; 3282 3283 case Op::sqrt_f32: a->vsqrtps(dst(), r[x]); break; 3284 3285 case Op::add_f32_imm: a->vaddps(dst(), r[x], &constants[immy].label); break; 3286 case Op::sub_f32_imm: a->vsubps(dst(), r[x], &constants[immy].label); break; 3287 case Op::mul_f32_imm: a->vmulps(dst(), r[x], &constants[immy].label); break; 3288 case Op::min_f32_imm: a->vminps(dst(), r[x], &constants[immy].label); break; 3289 case Op::max_f32_imm: a->vmaxps(dst(), r[x], &constants[immy].label); break; 3290 3291 case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break; 3292 case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break; 3293 case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break; 3294 3295 case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break; 3296 case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break; 3297 case Op::shr_i16x2: a->vpsrlw (dst(), r[x], immy); break; 3298 3299 case Op::bit_and : a->vpand (dst(), r[x], r[y]); break; 3300 case Op::bit_or : a->vpor (dst(), r[x], r[y]); break; 3301 case Op::bit_xor : a->vpxor (dst(), r[x], r[y]); break; 3302 case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break; // N.B. Y then X. 3303 case Op::select : a->vpblendvb(dst(), r[z], r[y], r[x]); break; 3304 3305 case Op::bit_and_imm: a->vpand (dst(), r[x], &constants[immy].label); break; 3306 case Op::bit_or_imm : a->vpor (dst(), r[x], &constants[immy].label); break; 3307 case Op::bit_xor_imm: a->vpxor (dst(), r[x], &constants[immy].label); break; 3308 3309 case Op::shl_i32: a->vpslld(dst(), r[x], immy); break; 3310 case Op::shr_i32: a->vpsrld(dst(), r[x], immy); break; 3311 case Op::sra_i32: a->vpsrad(dst(), r[x], immy); break; 3312 3313 case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break; 3314 case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break; 3315 3316 case Op:: eq_f32: a->vcmpeqps (dst(), r[x], r[y]); break; 3317 case Op::neq_f32: a->vcmpneqps(dst(), r[x], r[y]); break; 3318 case Op:: gt_f32: a->vcmpltps (dst(), r[y], r[x]); break; 3319 case Op::gte_f32: a->vcmpleps (dst(), r[y], r[x]); break; 3320 3321 case Op::pack: a->vpslld(tmp(), r[y], immz); 3322 a->vpor (dst(), tmp(), r[x]); 3323 break; 3324 3325 case Op::floor : a->vroundps (dst(), r[x], Assembler::FLOOR); break; 3326 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break; 3327 case Op::trunc : a->vcvttps2dq(dst(), r[x]); break; 3328 case Op::round : a->vcvtps2dq (dst(), r[x]); break; 3329 3330 case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(immy)->label); 3331 break; 3332 3333 #elif defined(__aarch64__) 3334 case Op::assert_true: { 3335 a->uminv4s(tmp(), r[x]); // uminv acts like an all() across the vector. 3336 a->fmovs(scratch, tmp()); 3337 A::Label all_true; 3338 a->cbnz(scratch, &all_true); 3339 a->brk(0); 3340 a->label(&all_true); 3341 } break; 3342 3343 case Op::store8: a->xtns2h(tmp(), r[x]); 3344 a->xtnh2b(tmp(), tmp()); 3345 if (scalar) { a->strb (tmp(), arg[immy]); } 3346 else { a->strs (tmp(), arg[immy]); } 3347 break; 3348 // TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here. 3349 3350 case Op::store32: if (scalar) { a->strs(r[x], arg[immy]); } 3351 else { a->strq(r[x], arg[immy]); } 3352 break; 3353 3354 case Op::load8: if (scalar) { a->ldrb(tmp(), arg[immy]); } 3355 else { a->ldrs(tmp(), arg[immy]); } 3356 a->uxtlb2h(tmp(), tmp()); 3357 a->uxtlh2s(dst(), tmp()); 3358 break; 3359 3360 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); } 3361 else { a->ldrq(dst(), arg[immy]); } 3362 break; 3363 3364 case Op::splat: if (immy) { a->ldrq(dst(), &constants[immy].label); } 3365 else { a->eor16b(dst(), dst(), dst()); } 3366 break; 3367 // TODO: If we hoist these, pack 4 values in each register 3368 // and use vector/lane operations, cutting the register 3369 // pressure cost of hoisting by 4? 3370 3371 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break; 3372 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break; 3373 case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break; 3374 case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break; 3375 case Op::min_f32: a->fmin4s(dst(), r[x], r[y]); break; 3376 case Op::max_f32: a->fmax4s(dst(), r[x], r[y]); break; 3377 3378 case Op::fma_f32: // fmla.4s is z += x*y 3379 if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z], r[x], r[y]); } 3380 else { a->orr16b(tmp(), r[z], r[z]); 3381 a->fmla4s(tmp(), r[x], r[y]); 3382 if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } 3383 break; 3384 3385 case Op::fnma_f32: // fmls.4s is z -= x*y 3386 if (avail & (1<<r[z])) { set_dst(r[z]); a->fmls4s( r[z], r[x], r[y]); } 3387 else { a->orr16b(tmp(), r[z], r[z]); 3388 a->fmls4s(tmp(), r[x], r[y]); 3389 if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } 3390 break; 3391 3392 case Op::fms_f32: 3393 // first dst() = xy - z as if fnma_f32 3394 if (avail & (1<<r[z])) { set_dst(r[z]); a->fmls4s( r[z], r[x], r[y]); } 3395 else { a->orr16b(tmp(), r[z], r[z]); 3396 a->fmls4s(tmp(), r[x], r[y]); 3397 if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } 3398 // then dst() = -dst() (i.e. z - xy) 3399 a->fneg4s(dst(), dst()); 3400 break; 3401 3402 // These _imm instructions are all x86/JIT only. 3403 case Op::add_f32_imm : 3404 case Op::sub_f32_imm : 3405 case Op::mul_f32_imm : 3406 case Op::min_f32_imm : 3407 case Op::max_f32_imm : 3408 case Op::bit_and_imm : 3409 case Op::bit_or_imm : 3410 case Op::bit_xor_imm : SkUNREACHABLE; break; 3411 3412 case Op:: gt_f32: a->fcmgt4s (dst(), r[x], r[y]); break; 3413 case Op::gte_f32: a->fcmge4s (dst(), r[x], r[y]); break; 3414 case Op:: eq_f32: a->fcmeq4s (dst(), r[x], r[y]); break; 3415 case Op::neq_f32: a->fcmeq4s (tmp(), r[x], r[y]); 3416 a->not16b (dst(), tmp()); break; 3417 3418 3419 case Op::add_i32: a->add4s(dst(), r[x], r[y]); break; 3420 case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break; 3421 case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break; 3422 3423 case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break; 3424 case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break; 3425 case Op::shr_i16x2: a->ushr8h(dst(), r[x], immy); break; 3426 3427 case Op::bit_and : a->and16b(dst(), r[x], r[y]); break; 3428 case Op::bit_or : a->orr16b(dst(), r[x], r[y]); break; 3429 case Op::bit_xor : a->eor16b(dst(), r[x], r[y]); break; 3430 case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break; 3431 3432 case Op::select: // bsl16b is x = x ? y : z 3433 if (avail & (1<<r[x])) { set_dst(r[x]); a->bsl16b( r[x], r[y], r[z]); } 3434 else { a->orr16b(tmp(), r[x], r[x]); 3435 a->bsl16b(tmp(), r[y], r[z]); 3436 if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } } 3437 break; 3438 3439 case Op::shl_i32: a-> shl4s(dst(), r[x], immy); break; 3440 case Op::shr_i32: a->ushr4s(dst(), r[x], immy); break; 3441 case Op::sra_i32: a->sshr4s(dst(), r[x], immy); break; 3442 3443 case Op::eq_i32: a->cmeq4s(dst(), r[x], r[y]); break; 3444 case Op::gt_i32: a->cmgt4s(dst(), r[x], r[y]); break; 3445 3446 case Op::pack: 3447 if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x], r[y], immz); } 3448 else { a->shl4s (tmp(), r[y], immz); 3449 a->orr16b(dst(), tmp(), r[x]); } 3450 break; 3451 3452 case Op::to_f32: a->scvtf4s (dst(), r[x]); break; 3453 case Op::trunc: a->fcvtzs4s(dst(), r[x]); break; 3454 case Op::round: a->fcvtns4s(dst(), r[x]); break; 3455 // TODO: fcvtns.4s rounds to nearest even. 3456 // I think we actually want frintx -> fcvtzs to round to current mode. 3457 3458 case Op::bytes: 3459 if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(immy)->reg); } 3460 else { a->ldrq(tmp(), &bytes_masks.find(immy)->label); 3461 a->tbl (dst(), r[x], tmp()); } 3462 break; 3463 #endif 3464 } 3465 3466 // Calls to tmp() or dst() might have flipped this false from its default true state. 3467 return ok; 3468 }; 3469 3470 3471 #if defined(__x86_64__) 3472 const int K = 8; 3473 auto jump_if_less = [&](A::Label* l) { a->jl (l); }; 3474 auto jump = [&](A::Label* l) { a->jmp(l); }; 3475 3476 auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); }; 3477 auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); }; 3478 3479 auto exit = [&]{ a->vzeroupper(); a->ret(); }; 3480 #elif defined(__aarch64__) 3481 const int K = 4; 3482 auto jump_if_less = [&](A::Label* l) { a->blt(l); }; 3483 auto jump = [&](A::Label* l) { a->b (l); }; 3484 3485 auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); }; 3486 auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); }; 3487 3488 auto exit = [&]{ a->ret(A::x30); }; 3489 #endif 3490 3491 A::Label body, 3492 tail, 3493 done; 3494 3495 for (Val id = 0; id < (Val)instructions.size(); id++) { 3496 if (!warmup(id)) { 3497 return false; 3498 } 3499 if (hoisted(id) && !emit(id, /*scalar=*/false)) { 3500 return false; 3501 } 3502 } 3503 3504 a->label(&body); 3505 { 3506 a->cmp(N, K); 3507 jump_if_less(&tail); 3508 for (Val id = 0; id < (Val)instructions.size(); id++) { 3509 if (!hoisted(id) && !emit(id, /*scalar=*/false)) { 3510 return false; 3511 } 3512 } 3513 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 3514 if (fImpl->strides[i]) { 3515 add(arg[i], K*fImpl->strides[i]); 3516 } 3517 } 3518 sub(N, K); 3519 jump(&body); 3520 } 3521 3522 a->label(&tail); 3523 { 3524 a->cmp(N, 1); 3525 jump_if_less(&done); 3526 for (Val id = 0; id < (Val)instructions.size(); id++) { 3527 if (!hoisted(id) && !emit(id, /*scalar=*/true)) { 3528 return false; 3529 } 3530 } 3531 for (int i = 0; i < (int)fImpl->strides.size(); i++) { 3532 if (fImpl->strides[i]) { 3533 add(arg[i], 1*fImpl->strides[i]); 3534 } 3535 } 3536 sub(N, 1); 3537 jump(&tail); 3538 } 3539 3540 a->label(&done); 3541 { 3542 exit(); 3543 } 3544 3545 // Except for explicit aligned load and store instructions, AVX allows 3546 // memory operands to be unaligned. So even though we're creating 16 3547 // byte patterns on ARM or 32-byte patterns on x86, we only need to 3548 // align to 4 bytes, the element size and alignment requirement. 3549 3550 constants.foreach([&](int imm, LabelAndReg* entry) { 3551 a->align(4); 3552 a->label(&entry->label); 3553 for (int i = 0; i < K; i++) { 3554 a->word(imm); 3555 } 3556 }); 3557 3558 bytes_masks.foreach([&](int imm, LabelAndReg* entry) { 3559 // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb. 3560 a->align(4); 3561 a->label(&entry->label); 3562 int mask[4]; 3563 bytes_control(imm, mask); 3564 a->bytes(mask, sizeof(mask)); 3565 #if defined(__x86_64__) 3566 a->bytes(mask, sizeof(mask)); 3567 #endif 3568 }); 3569 3570 if (!iota.label.references.empty()) { 3571 a->align(4); 3572 a->label(&iota.label); 3573 for (int i = 0; i < K; i++) { 3574 a->word(i); 3575 } 3576 } 3577 3578 return true; 3579 } 3580 setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)3581 void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions, 3582 const char* debug_name) { 3583 // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble. 3584 Assembler a{nullptr}; 3585 3586 // First try allowing code hoisting (faster code) 3587 // then again without if that fails (lower register pressure). 3588 bool try_hoisting = true; 3589 if (!this->jit(instructions, try_hoisting, &a)) { 3590 try_hoisting = false; 3591 if (!this->jit(instructions, try_hoisting, &a)) { 3592 return; 3593 } 3594 } 3595 3596 // Allocate space that we can remap as executable. 3597 const size_t page = sysconf(_SC_PAGESIZE); 3598 3599 // mprotect works at page granularity. 3600 fImpl->jit_size = ((a.size() + page - 1) / page) * page; 3601 3602 void* jit_entry 3603 = mmap(nullptr,fImpl->jit_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0); 3604 fImpl->jit_entry.store(jit_entry); 3605 3606 // Assemble the program for real. 3607 a = Assembler{jit_entry}; 3608 SkAssertResult(this->jit(instructions, try_hoisting, &a)); 3609 SkASSERT(a.size() <= fImpl->jit_size); 3610 3611 // Remap as executable, and flush caches on platforms that need that. 3612 mprotect(jit_entry, fImpl->jit_size, PROT_READ|PROT_EXEC); 3613 __builtin___clear_cache((char*)jit_entry, 3614 (char*)jit_entry + fImpl->jit_size); 3615 3616 // For profiling and debugging, it's helpful to have this code loaded 3617 // dynamically rather than just jumping info fImpl->jit_entry. 3618 if (gSkVMJITViaDylib) { 3619 // Dump the raw program binary. 3620 SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name); 3621 int fd = mkstemp(path.writable_str()); 3622 ::write(fd, jit_entry, a.size()); 3623 close(fd); 3624 3625 this->dropJIT(); // (unmap and null out fImpl->jit_entry.) 3626 3627 // Convert it in-place to a dynamic library with a single symbol "skvm_jit": 3628 SkString cmd = SkStringPrintf( 3629 "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'" 3630 " | clang -x assembler -shared - -o %s", 3631 path.c_str(), path.c_str()); 3632 system(cmd.c_str()); 3633 3634 // Load that dynamic library and look up skvm_jit(). 3635 fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL); 3636 fImpl->jit_entry.store(dlsym(fImpl->dylib, "skvm_jit")); 3637 } 3638 } 3639 #endif 3640 3641 } // namespace skvm 3642