1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/core/SkString.h"
10 #include "include/private/SkChecksum.h"
11 #include "include/private/SkSpinlock.h"
12 #include "include/private/SkTFitsIn.h"
13 #include "include/private/SkThreadID.h"
14 #include "include/private/SkVx.h"
15 #include "src/core/SkColorSpaceXformSteps.h"
16 #include "src/core/SkCpu.h"
17 #include "src/core/SkOpts.h"
18 #include "src/core/SkVM.h"
19 #include <algorithm>
20 #include <atomic>
21 #include <queue>
22 
23 #if defined(SKVM_LLVM)
24     #include <future>
25     #include <llvm/Bitcode/BitcodeWriter.h>
26     #include <llvm/ExecutionEngine/ExecutionEngine.h>
27     #include <llvm/IR/IRBuilder.h>
28     #include <llvm/IR/Verifier.h>
29     #include <llvm/Support/TargetSelect.h>
30 #endif
31 
// NOTE(review): presumably, when true, JIT code is routed through a dlopen'd
// dylib (see Impl::dylib and the dlfcn.h include below) rather than raw mmap'd
// memory — confirm against the SKVM_JIT implementation further down the file.
bool gSkVMJITViaDylib{false};
33 
34 // JIT code isn't MSAN-instrumented, so we won't see when it uses
35 // uninitialized memory, and we'll not see the writes it makes as properly
36 // initializing memory.  Instead force the interpreter, which should let
37 // MSAN see everything our programs do properly.
38 //
39 // Similarly, we can't get ASAN's checks unless we let it instrument our interpreter.
40 #if defined(__has_feature)
41     #if __has_feature(memory_sanitizer) || __has_feature(address_sanitizer)
42         #undef SKVM_JIT
43     #endif
44 #endif
45 
46 #if defined(SKVM_JIT)
47     #include <dlfcn.h>      // dlopen, dlsym
48     #include <sys/mman.h>   // mmap, mprotect
49 #endif
50 
51 namespace skvm {
52 
    struct Program::Impl {
        // The interpreter program: the instruction stream, how many virtual
        // registers it needs, and the index of the first instruction of the
        // per-iteration loop body (instructions before it are hoisted).
        std::vector<InterpreterInstruction> instructions;
        int regs = 0;
        int loop = 0;
        // One stride per pointer argument — presumably bytes advanced per lane
        // of progress; TODO confirm against Program::eval()'s use.
        std::vector<int> strides;

        std::atomic<void*> jit_entry{nullptr};   // TODO: minimal std::memory_orders
        size_t jit_size = 0;    // size of the JIT mapping (NOTE(review): likely for munmap — confirm)
        void*  dylib    = nullptr;  // non-null when the JIT code came from a dlopen'd dylib

    #if defined(SKVM_LLVM)
        // LLVM JIT state; llvm_compiling lets compilation proceed asynchronously.
        std::unique_ptr<llvm::LLVMContext>     llvm_ctx;
        std::unique_ptr<llvm::ExecutionEngine> llvm_ee;
        std::future<void>                      llvm_compiling;
    #endif
    };
69 
70     // Debugging tools, mostly for printing various data structures out to a stream.
71 
72     namespace {
73         class SkDebugfStream final : public SkWStream {
74             size_t fBytesWritten = 0;
75 
write(const void * buffer,size_t size)76             bool write(const void* buffer, size_t size) override {
77                 SkDebugf("%.*s", size, buffer);
78                 fBytesWritten += size;
79                 return true;
80             }
81 
bytesWritten() const82             size_t bytesWritten() const override {
83                 return fBytesWritten;
84             }
85         };
86 
87         struct V { Val id; };
88         struct R { Reg id; };
89         struct Shift { int bits; };
90         struct Splat { int bits; };
91         struct Hex   { int bits; };
92 
write(SkWStream * o,const char * s)93         static void write(SkWStream* o, const char* s) {
94             o->writeText(s);
95         }
96 
name(Op op)97         static const char* name(Op op) {
98             switch (op) {
99             #define M(x) case Op::x: return #x;
100                 SKVM_OPS(M)
101             #undef M
102             }
103             return "unknown op";
104         }
105 
write(SkWStream * o,Op op)106         static void write(SkWStream* o, Op op) {
107             const char* raw = name(op);
108             if (const char* found = strstr(raw, "_imm")) {
109                 o->write(raw, found-raw);
110             } else {
111                 o->writeText(raw);
112             }
113         }
write(SkWStream * o,Arg a)114         static void write(SkWStream* o, Arg a) {
115             write(o, "arg(");
116             o->writeDecAsText(a.ix);
117             write(o, ")");
118         }
write(SkWStream * o,V v)119         static void write(SkWStream* o, V v) {
120             write(o, "v");
121             o->writeDecAsText(v.id);
122         }
write(SkWStream * o,R r)123         static void write(SkWStream* o, R r) {
124             write(o, "r");
125             o->writeDecAsText(r.id);
126         }
write(SkWStream * o,Shift s)127         static void write(SkWStream* o, Shift s) {
128             o->writeDecAsText(s.bits);
129         }
write(SkWStream * o,Splat s)130         static void write(SkWStream* o, Splat s) {
131             float f;
132             memcpy(&f, &s.bits, 4);
133             o->writeHexAsText(s.bits);
134             write(o, " (");
135             o->writeScalarAsText(f);
136             write(o, ")");
137         }
write(SkWStream * o,Hex h)138         static void write(SkWStream* o, Hex h) {
139             o->writeHexAsText(h.bits);
140         }
141 
142         template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)143         static void write(SkWStream* o, T first, Ts... rest) {
144             write(o, first);
145             write(o, " ");
146             write(o, rest...);
147         }
148     }
149 
dot(SkWStream * o,bool for_jit) const150     void Builder::dot(SkWStream* o, bool for_jit) const {
151         SkDebugfStream debug;
152         if (!o) { o = &debug; }
153 
154         std::vector<OptimizedInstruction> optimized = this->optimize(for_jit);
155 
156         o->writeText("digraph {\n");
157         for (Val id = 0; id < (Val)optimized.size(); id++) {
158             const OptimizedInstruction& i = optimized[id];
159 
160             switch (i.op) {
161                 default:
162                     write(o, "\t", V{id}, " [label = \"", V{id}, i.op);
163                     // Not a perfect heuristic; sometimes y/z == NA and there is no immy/z.
164                     // On the other hand, sometimes immy/z=0 is meaningful and should be printed.
165                     if (i.y == NA) { write(o, "", Hex{i.immy}); }
166                     if (i.z == NA) { write(o, "", Hex{i.immz}); }
167                     write(o, "\"]\n");
168 
169                     write(o, "\t", V{id}, " -> {");
170                     // In contrast to the heuristic imm labels, these dependences are exact.
171                     if (i.x != NA) { write(o, "", V{i.x}); }
172                     if (i.y != NA) { write(o, "", V{i.y}); }
173                     if (i.z != NA) { write(o, "", V{i.z}); }
174                     write(o, " }\n");
175 
176                     break;
177 
178                 // That default: impl works pretty well for most instructions,
179                 // but some are nicer to see with a specialized label.
180 
181                 case Op::splat:
182                     write(o, "\t", V{id}, " [label = \"", V{id}, i.op, Splat{i.immy}, "\"]\n");
183                     break;
184             }
185         }
186         o->writeText("}\n");
187     }
188 
    // Print a human-readable listing of this builder's optimized program,
    // one value per line.  A null stream sends the output to SkDebugf().
    void Builder::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        std::vector<OptimizedInstruction> optimized = this->optimize();
        o->writeDecAsText(optimized.size());
        o->writeText(" values (originally ");
        o->writeDecAsText(fProgram.size());
        o->writeText("):\n");
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            const OptimizedInstruction& inst = optimized[id];
            Op  op = inst.op;
            Val  x = inst.x,
                 y = inst.y,
                 z = inst.z;
            int immy = inst.immy,
                immz = inst.immz;
            // Margin marker: "  " = not hoistable (lives in the loop),
            // "↑ " = hoistable but its value is used in the loop,
            // "↟ " = hoistable and not used in the loop.
            write(o, !inst.can_hoist    ? "  " :
                      inst.used_in_loop ? "↑ " :
                                          "↟ ");
            // One case per op, formatting its operands/immediates appropriately.
            switch (op) {
                case Op::assert_true: write(o, op, V{x}, V{y}); break;

                case Op::store8:  write(o, op, Arg{immy}, V{x}); break;
                case Op::store16: write(o, op, Arg{immy}, V{x}); break;
                case Op::store32: write(o, op, Arg{immy}, V{x}); break;

                case Op::index: write(o, V{id}, "=", op); break;

                case Op::load8:  write(o, V{id}, "=", op, Arg{immy}); break;
                case Op::load16: write(o, V{id}, "=", op, Arg{immy}); break;
                case Op::load32: write(o, V{id}, "=", op, Arg{immy}); break;

                case Op::gather8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
                case Op::gather16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;
                case Op::gather32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}, V{x}); break;

                case Op::uniform8:  write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform16: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform32: write(o, V{id}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::splat:  write(o, V{id}, "=", op, Splat{immy}); break;


                case Op::add_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::sub_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::mul_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::div_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::min_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::max_f32: write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::fma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
                case Op::fms_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
                case Op::fnma_f32: write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;


                case Op::sqrt_f32: write(o, V{id}, "=", op, V{x}); break;

                case Op::add_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::sub_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::mul_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::min_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;
                case Op::max_f32_imm: write(o, V{id}, "=", op, V{x}, Splat{immy}); break;

                case Op:: eq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::neq_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op:: gt_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::gte_f32: write(o, V{id}, "=", op, V{x}, V{y}); break;


                case Op::add_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::sub_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::mul_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::shl_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::shr_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::sra_i32: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;

                case Op:: eq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::neq_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op:: gt_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::gte_i32: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::add_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::sub_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::mul_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::shl_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::shr_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;
                case Op::sra_i16x2: write(o, V{id}, "=", op, V{x}, Shift{immy}); break;

                case Op:: eq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::neq_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op:: gt_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;
                case Op::gte_i16x2: write(o, V{id}, "=", op, V{x}, V{y}); break;

                case Op::bit_and  : write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::bit_or   : write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::bit_xor  : write(o, V{id}, "=", op, V{x}, V{y}      ); break;
                case Op::bit_clear: write(o, V{id}, "=", op, V{x}, V{y}      ); break;

                case Op::bit_and_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
                case Op::bit_or_imm : write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
                case Op::bit_xor_imm: write(o, V{id}, "=", op, V{x}, Hex{immy}); break;

                case Op::select:  write(o, V{id}, "=", op, V{x}, V{y}, V{z}); break;
                case Op::bytes:   write(o, V{id}, "=", op, V{x}, Hex{immy}); break;
                case Op::pack:    write(o, V{id}, "=", op, V{x}, V{y}, Shift{immz}); break;

                case Op::floor:  write(o, V{id}, "=", op, V{x}); break;
                case Op::to_f32: write(o, V{id}, "=", op, V{x}); break;
                case Op::trunc:  write(o, V{id}, "=", op, V{x}); break;
                case Op::round:  write(o, V{id}, "=", op, V{x}); break;
            }

            write(o, "\n");
        }
    }
306 
    // Print a human-readable listing of the interpreter program: register
    // count, instruction count, then one instruction per line.  Instructions
    // at or after fImpl->loop are the per-iteration loop body and are printed
    // with extra indent after a "loop:" marker.  Null stream -> SkDebugf().
    void Program::dump(SkWStream* o) const {
        SkDebugfStream debug;
        if (!o) { o = &debug; }

        o->writeDecAsText(fImpl->regs);
        o->writeText(" registers, ");
        o->writeDecAsText(fImpl->instructions.size());
        o->writeText(" instructions:\n");
        for (Val i = 0; i < (Val)fImpl->instructions.size(); i++) {
            if (i == fImpl->loop) { write(o, "loop:\n"); }
            o->writeDecAsText(i);
            o->writeText("\t");
            if (i >= fImpl->loop) { write(o, "    "); }   // indent the loop body
            const InterpreterInstruction& inst = fImpl->instructions[i];
            Op   op = inst.op;
            Reg   d = inst.d,
                  x = inst.x,
                  y = inst.y,
                  z = inst.z;
            int immy = inst.immy,
                immz = inst.immz;
            // One case per op; same formatting as Builder::dump() but with
            // registers (R) instead of value IDs (V).
            switch (op) {
                case Op::assert_true: write(o, op, R{x}, R{y}); break;

                case Op::store8:  write(o, op, Arg{immy}, R{x}); break;
                case Op::store16: write(o, op, Arg{immy}, R{x}); break;
                case Op::store32: write(o, op, Arg{immy}, R{x}); break;

                case Op::index: write(o, R{d}, "=", op); break;

                case Op::load8:  write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load16: write(o, R{d}, "=", op, Arg{immy}); break;
                case Op::load32: write(o, R{d}, "=", op, Arg{immy}); break;

                case Op::gather8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;
                case Op::gather32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}, R{x}); break;

                case Op::uniform8:  write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform16: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;
                case Op::uniform32: write(o, R{d}, "=", op, Arg{immy}, Hex{immz}); break;

                case Op::splat:  write(o, R{d}, "=", op, Splat{immy}); break;


                case Op::add_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::sub_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::mul_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::div_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::min_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::max_f32: write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::fma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fms_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::fnma_f32: write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;

                case Op::sqrt_f32: write(o, R{d}, "=", op, R{x}); break;

                case Op::add_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::sub_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::mul_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::min_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;
                case Op::max_f32_imm: write(o, R{d}, "=", op, R{x}, Splat{immy}); break;

                case Op:: eq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_f32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::shr_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::sra_i32: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;

                case Op:: eq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_i32: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::add_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::sub_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::mul_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;

                case Op::shl_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::shr_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;
                case Op::sra_i16x2: write(o, R{d}, "=", op, R{x}, Shift{immy}); break;

                case Op:: eq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::neq_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op:: gt_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;
                case Op::gte_i16x2: write(o, R{d}, "=", op, R{x}, R{y}); break;


                case Op::bit_and  : write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::bit_or   : write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::bit_xor  : write(o, R{d}, "=", op, R{x}, R{y}      ); break;
                case Op::bit_clear: write(o, R{d}, "=", op, R{x}, R{y}      ); break;

                case Op::bit_and_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
                case Op::bit_or_imm : write(o, R{d}, "=", op, R{x}, Hex{immy}); break;
                case Op::bit_xor_imm: write(o, R{d}, "=", op, R{x}, Hex{immy}); break;

                case Op::select:  write(o, R{d}, "=", op, R{x}, R{y}, R{z}); break;
                case Op::bytes:   write(o, R{d}, "=", op,  R{x}, Hex{immy}); break;
                case Op::pack:    write(o, R{d}, "=", op,   R{x}, R{y}, Shift{immz}); break;

                case Op::floor:  write(o, R{d}, "=", op, R{x}); break;
                case Op::to_f32: write(o, R{d}, "=", op, R{x}); break;
                case Op::trunc:  write(o, R{d}, "=", op, R{x}); break;
                case Op::round:  write(o, R{d}, "=", op, R{x}); break;
            }
            write(o, "\n");
        }
    }
425 
    // Rewrite `program` in place, replacing ops whose operand is a known
    // constant (a splat) with their *_imm specializations.  On non-x86 this
    // is a no-op copy through a fresh Builder (which also re-runs CSE).
    void specialize_for_jit(std::vector<Instruction>* program) {
        Builder specialized;
        for (Val i = 0; i < (Val)program->size(); i++) {
            Instruction inst = (*program)[i];

            #if defined(SK_CPU_X86)
            // True iff value `id` is a splat; its constant bits land in *bits.
            auto is_imm = [&](Val id, int* bits) {
                *bits = (*program)[id].immy;
                return  (*program)[id].op == Op::splat;
            };

            Op imm_op;
            int bits;
            switch (inst.op) {
                default: break;

                // Commutative ops: either operand may be the immediate.
                case Op::add_f32: imm_op = Op::add_f32_imm; goto try_imm_x_and_y;
                case Op::mul_f32: imm_op = Op::mul_f32_imm; goto try_imm_x_and_y;
                case Op::min_f32: imm_op = Op::min_f32_imm; goto try_imm_x_and_y;
                case Op::max_f32: imm_op = Op::max_f32_imm; goto try_imm_x_and_y;
                case Op::bit_and: imm_op = Op::bit_and_imm; goto try_imm_x_and_y;
                case Op::bit_or:  imm_op = Op::bit_or_imm ; goto try_imm_x_and_y;
                case Op::bit_xor: imm_op = Op::bit_xor_imm; goto try_imm_x_and_y;

                try_imm_x_and_y:
                    if (is_imm(inst.x, &bits)) {
                        // Immediate was on the left; move the variable to x.
                        inst.op   = imm_op;
                        inst.x    = inst.y;
                        inst.y    = NA;
                        inst.immy = bits;
                    } else if (is_imm(inst.y, &bits)) {
                        inst.op   = imm_op;
                        inst.y    = NA;
                        inst.immy = bits;
                    } break;

                // sub is not commutative: only a constant subtrahend qualifies.
                case Op::sub_f32:
                    if (is_imm(inst.y, &bits)) {
                        inst.op   = Op::sub_f32_imm;
                        inst.y    = NA;
                        inst.immy = bits;
                    } break;

                // x & ~splat(bits) == x & splat(~bits), so bit_clear with a
                // constant mask becomes bit_and_imm with the inverted bits.
                case Op::bit_clear:
                    if (is_imm(inst.y, &bits)) {
                        inst.op   = Op::bit_and_imm;
                        inst.y    = NA;
                        inst.immy = ~bits;
                    } break;
            }
            #endif
            // In release builds push()'s result is discarded and the assert
            // compiles away.
            SkDEBUGCODE(Val id =) specialized.push(inst);
            // If we replace single instructions with multiple, this will start breaking,
            // and we'll need a table to remap them like we have in optimize().
            SkASSERT(id == i);
        }

        *program = specialized.program();
    }
485 
    // Produce the optimized instruction stream: (optionally) specialize for
    // the JIT, drop dead instructions, schedule the survivors to minimize
    // register pressure, then annotate each value with its lifetime (death)
    // and whether it can be hoisted out of the loop.
    std::vector<OptimizedInstruction> Builder::optimize(bool for_jit) const {
        std::vector<Instruction> program = this->program();
        if (for_jit) {
            specialize_for_jit(&program);
        }

        // liveness_analysis() flags which instructions are live and seeds
        // `frontier` with the instructions that are ready to schedule first.
        std::vector<bool> live_instructions;
        std::vector<Val> frontier;
        int liveInstructionCount = liveness_analysis(program, &live_instructions, &frontier);
        skvm::Usage usage{program, live_instructions};

        // How many (live) instructions still consume each value; when this
        // drops to zero a value's register could be recycled.
        std::vector<int> remaining_uses;
        for (Val id = 0; id < (Val)program.size(); id++) {
            remaining_uses.push_back((int)usage.users(id).size());
        }

        // Map old Val index to rewritten index in optimized.
        std::vector<Val> new_index(program.size(), NA);

        // Net register-pressure delta of issuing instruction `id`.
        auto pressure_change = [&](Val id) -> int {
            int pressure = 0;
            Instruction inst = program[id];

            // If this is not a sink, then it takes up a register
            if (inst.op > Op::store32) { pressure += 1; }

            // If this is the last use of the value, then that register will be free.
            if (inst.x != NA && remaining_uses[inst.x] == 1) { pressure -= 1; }
            if (inst.y != NA && remaining_uses[inst.y] == 1) { pressure -= 1; }
            if (inst.z != NA && remaining_uses[inst.z] == 1) { pressure -= 1; }
            return pressure;
        };

        auto compare = [&](Val lhs, Val rhs) {
            SkASSERT(lhs != rhs);
            int lhs_change = pressure_change(lhs);
            int rhs_change = pressure_change(rhs);

            // This comparison operator orders instructions from least (likely negative) register
            // pressure to most register pressure,  breaking ties arbitrarily using original
            // program order comparing the instruction index itself.
            //
            // We'll use this operator with std::{make,push,pop}_heap() to maintain a max heap
            // frontier of instructions that are ready to schedule.  We iterate backwards through
            // the program, scheduling later instruction slots before earlier ones, and that means
            // an instruction becomes ready to schedule once all instructions using its result have
            // been scheduled (in later slots).
            //
            // All together that means we'll be issuing the instructions that hurt register pressure
            // as late as possible, and issuing the instructions that help register pressure as soon
            // as possible.
            //
            // This heuristic of greedily issuing the instruction that most immediately decreases
            // register pressure approximates a more expensive search to find a schedule that
            // minimizes the high-water maximum register pressure, the number of registers we'll
            // need to run this program.
            //
            // The tie-breaker heuristic was found through experimentation.
            return lhs_change < rhs_change || (lhs_change == rhs_change && lhs > rhs);
        };

        // Order the instructions.
        std::make_heap(frontier.begin(), frontier.end(), compare);

        // Schedule the instructions last to first from the DAG. Produce a schedule that executes
        // instructions that reduce register pressure before ones that increase register
        // pressure.
        std::vector<OptimizedInstruction> optimized;
        optimized.resize(liveInstructionCount);
        for (int i = liveInstructionCount; i-- > 0;) {
            SkASSERT(!frontier.empty());
            std::pop_heap(frontier.begin(), frontier.end(), compare);
            Val id = frontier.back();
            frontier.pop_back();
            new_index[id] = i;
            Instruction inst = program[id];
            SkASSERT(remaining_uses[id] == 0);

            // Use the old indices, and fix them up later.
            optimized[i] = {inst.op,
                            inst.x, inst.y, inst.z,
                            inst.immy, inst.immz,
                             /*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false};

            // An input becomes ready to schedule once this was its last
            // unscheduled user.
            auto maybe_issue = [&](Val input) {
              if (input != NA) {
                  if (remaining_uses[input] == 1) {
                      frontier.push_back(input);
                      std::push_heap(frontier.begin(), frontier.end(), compare);
                  }
                  remaining_uses[input]--;
              }
            };
            maybe_issue(inst.x);
            maybe_issue(inst.y);
            maybe_issue(inst.z);
        }

        // Fix up the optimized program to use the optimized indices.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            if (inst.x != NA ) { inst.x = new_index[inst.x]; }
            if (inst.y != NA ) { inst.y = new_index[inst.y]; }
            if (inst.z != NA ) { inst.z = new_index[inst.z]; }
        }

        SkASSERT(frontier.empty());

        // We're done with `program` now... everything below will analyze `optimized`.

        // We'll want to know when it's safe to recycle registers holding the values
        // produced by each instruction, that is, when no future instruction needs it.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];
            // Stores don't really produce values.  Just mark them as dying on issue.
            if (inst.op <= Op::store32) {
                inst.death = id;
            }
            // Extend the lifetime of this instruction's inputs to live until it issues.
            // (We're walking in order, so this is the same as max()ing.)
            if (inst.x != NA) { optimized[inst.x].death = id; }
            if (inst.y != NA) { optimized[inst.y].death = id; }
            if (inst.z != NA) { optimized[inst.z].death = id; }
        }

        // Mark which values don't depend on the loop and can be hoisted.
        for (Val id = 0; id < (Val)optimized.size(); id++) {
            OptimizedInstruction& inst = optimized[id];

            // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
            if (inst.op <= Op::gather32 && inst.op != Op::assert_true) {
                inst.can_hoist = false;
            }

            // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
            if (inst.can_hoist) {
                if (inst.x != NA) { inst.can_hoist &= optimized[inst.x].can_hoist; }
                if (inst.y != NA) { inst.can_hoist &= optimized[inst.y].can_hoist; }
                if (inst.z != NA) { inst.can_hoist &= optimized[inst.z].can_hoist; }
            }

            // We'll want to know if hoisted values are used in the loop;
            // if not, we can recycle their registers like we do loop values.
            if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) {
                if (inst.x != NA) { optimized[inst.x].used_in_loop = true; }
                if (inst.y != NA) { optimized[inst.y].used_in_loop = true; }
                if (inst.z != NA) { optimized[inst.z].used_in_loop = true; }
            }
        }

        return optimized;
    }
638 
done(const char * debug_name) const639     Program Builder::done(const char* debug_name) const {
640         char buf[64] = "skvm-jit-";
641         if (!debug_name) {
642             *SkStrAppendU32(buf+9, this->hash()) = '\0';
643             debug_name = buf;
644         }
645 
646     #if defined(SKVM_LLVM) || defined(SKVM_JIT)
647         return {this->optimize(false), this->optimize(true), fStrides, debug_name};
648     #else
649         return {this->optimize(false), fStrides};
650     #endif
651     }
652 
hash() const653     uint64_t Builder::hash() const {
654         uint32_t lo = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 0),
655                  hi = SkOpts::hash(fProgram.data(), fProgram.size() * sizeof(Instruction), 1);
656         return (uint64_t)lo | (uint64_t)hi << 32;
657     }
658 
operator ==(const Instruction & a,const Instruction & b)659     bool operator==(const Instruction& a, const Instruction& b) {
660         return a.op   == b.op
661             && a.x    == b.x
662             && a.y    == b.y
663             && a.z    == b.z
664             && a.immy == b.immy
665             && a.immz == b.immz;
666     }
667 
    // Hash an Instruction by its raw bytes (seeded), pairing with operator==
    // above so Instructions can key the Builder's CSE hash map.
    uint32_t InstructionHash::operator()(const Instruction& inst, uint32_t seed) const {
        return SkOpts::hash(&inst, sizeof(inst), seed);
    }
671 
672 
673     // Most instructions produce a value and return it by ID,
674     // the value-producing instruction's own index in the program vector.
push(Instruction inst)675     Val Builder::push(Instruction inst) {
676         // Basic common subexpression elimination:
677         // if we've already seen this exact Instruction, use it instead of creating a new one.
678         if (Val* id = fIndex.find(inst)) {
679             return *id;
680         }
681         Val id = static_cast<Val>(fProgram.size());
682         fProgram.push_back(inst);
683         fIndex.set(inst, id);
684         return id;
685     }
686 
    // Base case for the variadic allImm() below: no more (id, imm*) pairs left,
    // so everything checked out.
    bool Builder::allImm() const { return true; }

    // Return true iff every listed value id refers to a splat (a compile-time
    // constant), copying each constant's 4-byte bit pattern into the paired
    // *imm out-parameter.  Stops and returns false at the first non-splat.
    template <typename T, typename... Rest>
    bool Builder::allImm(Val id, T* imm, Rest... rest) const {
        if (fProgram[id].op == Op::splat) {
            static_assert(sizeof(T) == 4, "");
            // splat stores its constant's bits in immy.
            memcpy(imm, &fProgram[id].immy, 4);
            return this->allImm(rest...);
        }
        return false;
    }
698 
arg(int stride)699     Arg Builder::arg(int stride) {
700         int ix = (int)fStrides.size();
701         fStrides.push_back(stride);
702         return {ix};
703     }
704 
    // Debug-only: record an assertion that cond is true in every lane, with
    // `debug` carried alongside as an extra value to inspect on failure.
    // If cond is a known constant we can check it right now at build time
    // and emit nothing.  In release builds this is a no-op.
    void Builder::assert_true(I32 cond, I32 debug) {
    #ifdef SK_DEBUG
        int imm;
        if (this->allImm(cond.id,&imm)) { SkASSERT(imm); return; }
        (void)push(Op::assert_true, cond.id,debug.id,NA);
    #endif
    }
712 
    // Store the low 8/16/all 32 bits of val per lane to the buffer behind ptr.
    // Stores produce no value, so the pushed ID is discarded.
    void Builder::store8 (Arg ptr, I32 val) { (void)push(Op::store8 , val.id,NA,NA, ptr.ix); }
    void Builder::store16(Arg ptr, I32 val) { (void)push(Op::store16, val.id,NA,NA, ptr.ix); }
    void Builder::store32(Arg ptr, I32 val) { (void)push(Op::store32, val.id,NA,NA, ptr.ix); }

    // The current loop index (see Op::index).
    I32 Builder::index() { return {this, push(Op::index , NA,NA,NA,0) }; }

    // Load 8/16/32 bits per lane from the buffer behind ptr.
    I32 Builder::load8 (Arg ptr) { return {this, push(Op::load8 , NA,NA,NA, ptr.ix) }; }
    I32 Builder::load16(Arg ptr) { return {this, push(Op::load16, NA,NA,NA, ptr.ix) }; }
    I32 Builder::load32(Arg ptr) { return {this, push(Op::load32, NA,NA,NA, ptr.ix) }; }
722 
    // Gathers: per-lane indexed loads of 8/16/32 bits from the buffer behind
    // ptr, with a fixed `offset` baked into the instruction as an immediate.
    I32 Builder::gather8 (Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather8 , index.id,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather16(Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather16, index.id,NA,NA, ptr.ix,offset)};
    }
    I32 Builder::gather32(Arg ptr, int offset, I32 index) {
        return {this, push(Op::gather32, index.id,NA,NA, ptr.ix,offset)};
    }

    // Uniforms: load a single 8/16/32-bit value at a fixed offset from ptr,
    // shared by all lanes.
    I32 Builder::uniform8(Arg ptr, int offset) {
        return {this, push(Op::uniform8, NA,NA,NA, ptr.ix, offset)};
    }
    I32 Builder::uniform16(Arg ptr, int offset) {
        return {this, push(Op::uniform16, NA,NA,NA, ptr.ix, offset)};
    }
    I32 Builder::uniform32(Arg ptr, int offset) {
        return {this, push(Op::uniform32, NA,NA,NA, ptr.ix, offset)};
    }
742 
743     // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
splat(int n)744     I32 Builder::splat(int   n) { return {this, push(Op::splat, NA,NA,NA, n) }; }
splat(float f)745     F32 Builder::splat(float f) {
746         int bits;
747         memcpy(&bits, &f, 4);
748         return {this, push(Op::splat, NA,NA,NA, bits)};
749     }
750 
    // True when the host CPU has fused multiply-add instructions worth using:
    // Haswell+ on x86, always on ARM64, otherwise never.  Checked once and cached.
    static bool fma_supported() {
        static const bool supported =
     #if defined(SK_CPU_X86)
         SkCpu::Supports(SkCpu::HSW);
     #elif defined(SK_CPU_ARM64)
         true;
     #else
         false;
     #endif
         return supported;
    }
762 
763     // Be careful peepholing float math!  Transformations you might expect to
764     // be legal can fail in the face of NaN/Inf, e.g. 0*x is not always 0.
765     // Float peepholes must pass this equivalence test for all ~4B floats:
766     //
767     //     bool equiv(float x, float y) { return (x == y) || (isnanf(x) && isnanf(y)); }
768     //
769     //     unsigned bits = 0;
770     //     do {
771     //        float f;
772     //        memcpy(&f, &bits, 4);
773     //        if (!equiv(f, ...)) {
774     //           abort();
775     //        }
776     //     } while (++bits != 0);
777 
    // Float add with peepholes: constant folding, x+0/0+x identities, and
    // fusing a feeding multiply into an fma when the CPU supports it.
    F32 Builder::add(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X+Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x+0 == x
        if (this->isImm(x.id, 0.0f)) { return y; }   // 0+y == y

        if (fma_supported()) {
            // If either operand is itself a multiply, fold it and this add
            // into a single fused multiply-add: (a*b) + c  ->  fma(a,b,c).
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, push(Op::fma_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, push(Op::fma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, push(Op::add_f32, x.id, y.id)};
    }
794 
    // Float subtract with peepholes: constant folding, x-0 identity, and
    // fusing a feeding multiply into fms/fnma when the CPU supports it.
    F32 Builder::sub(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X-Y); }
        if (this->isImm(y.id, 0.0f)) { return x; }   // x-0 == x
        if (fma_supported()) {
            // (a*b) - c  ->  fms(a,b,c)
            if (fProgram[x.id].op == Op::mul_f32) {
                return {this, push(Op::fms_f32, fProgram[x.id].x, fProgram[x.id].y, y.id)};
            }
            // c - (a*b)  ->  fnma(a,b,c)
            if (fProgram[y.id].op == Op::mul_f32) {
                return {this, push(Op::fnma_f32, fProgram[y.id].x, fProgram[y.id].y, x.id)};
            }
        }
        return {this, push(Op::sub_f32, x.id, y.id)};
    }
809 
mul(F32 x,F32 y)810     F32 Builder::mul(F32 x, F32 y) {
811         float X,Y;
812         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X*Y); }
813         if (this->isImm(y.id, 1.0f)) { return x; }  // x*1 == x
814         if (this->isImm(x.id, 1.0f)) { return y; }  // 1*y == y
815         return {this, push(Op::mul_f32, x.id, y.id)};
816     }
817 
div(F32 x,F32 y)818     F32 Builder::div(F32 x, F32 y) {
819         float X,Y;
820         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X/Y); }
821         if (this->isImm(y.id, 1.0f)) { return x; }  // x/1 == x
822         return {this, push(Op::div_f32, x.id, y.id)};
823     }
824 
sqrt(F32 x)825     F32 Builder::sqrt(F32 x) {
826         float X;
827         if (this->allImm(x.id,&X)) { return this->splat(std::sqrt(X)); }
828         return {this, push(Op::sqrt_f32, x.id,NA,NA)};
829     }
830 
    // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html.
    F32 Builder::approx_log2(F32 x) {
        // e - 127 is a fair approximation of log2(x) in its own right...
        F32 e = mul(to_f32(bit_cast(x)), splat(1.0f / (1<<23)));

        // ... but using the mantissa to refine its error is _much_ better.
        // (Mask off the 23 mantissa bits and re-bias into [0.5, 1).)
        F32 m = bit_cast(bit_or(bit_and(bit_cast(x), 0x007fffff),
                                0x3f000000));
        // Polynomial/rational correction constants from the article above.
        F32 approx = sub(e,        124.225514990f);
            approx = sub(approx, mul(1.498030302f, m));
            approx = sub(approx, div(1.725879990f, add(0.3520887068f, m)));

        return approx;
    }
845 
    // Inverse of approx_log2() above: rebuild a float's bits from an exponent
    // estimate, refined by a rational correction in the fractional part.
    F32 Builder::approx_pow2(F32 x) {
        F32 f = fract(x);
        F32 approx = add(x,         121.274057500f);
            approx = sub(approx, mul( 1.490129070f, f));
            approx = add(approx, div(27.728023300f, sub(4.84252568f, f)));

        // Scale up by 2^23 and reinterpret the rounded bits as a float.
        return bit_cast(round(mul(1.0f * (1<<23), approx)));
    }
854 
    // powf(x,y) ~= 2^(y * log2(x)), except x == 0 and x == 1 are passed
    // through unchanged (the approximation would drift from the exact answer).
    F32 Builder::approx_powf(F32 x, F32 y) {
        auto is_x = bit_or(eq(x, 0.0f),
                           eq(x, 1.0f));
        return select(is_x, x, approx_pow2(mul(approx_log2(x), y)));
    }
860 
min(F32 x,F32 y)861     F32 Builder::min(F32 x, F32 y) {
862         float X,Y;
863         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(std::min(X,Y)); }
864         return {this, push(Op::min_f32, x.id, y.id)};
865     }
max(F32 x,F32 y)866     F32 Builder::max(F32 x, F32 y) {
867         float X,Y;
868         if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(std::max(X,Y)); }
869         return {this, push(Op::max_f32, x.id, y.id)};
870     }
871 
    // 32-bit integer arithmetic (no peepholes here).
    I32 Builder::add(I32 x, I32 y) { return {this, push(Op::add_i32, x.id, y.id)}; }
    I32 Builder::sub(I32 x, I32 y) { return {this, push(Op::sub_i32, x.id, y.id)}; }
    I32 Builder::mul(I32 x, I32 y) { return {this, push(Op::mul_i32, x.id, y.id)}; }

    // Same, treating each 32-bit lane as two independent 16-bit halves.
    I32 Builder::add_16x2(I32 x, I32 y) { return {this, push(Op::add_i16x2, x.id, y.id)}; }
    I32 Builder::sub_16x2(I32 x, I32 y) { return {this, push(Op::sub_i16x2, x.id, y.id)}; }
    I32 Builder::mul_16x2(I32 x, I32 y) { return {this, push(Op::mul_i16x2, x.id, y.id)}; }
879 
shl(I32 x,int bits)880     I32 Builder::shl(I32 x, int bits) {
881         if (bits == 0) { return x; }
882         int X;
883         if (this->allImm(x.id,&X)) { return this->splat(X << bits); }
884         return {this, push(Op::shl_i32, x.id,NA,NA, bits)};
885     }
shr(I32 x,int bits)886     I32 Builder::shr(I32 x, int bits) {
887         if (bits == 0) { return x; }
888         int X;
889         if (this->allImm(x.id,&X)) { return this->splat(unsigned(X) >> bits); }
890         return {this, push(Op::shr_i32, x.id,NA,NA, bits)};
891     }
sra(I32 x,int bits)892     I32 Builder::sra(I32 x, int bits) {
893         if (bits == 0) { return x; }
894         int X;
895         if (this->allImm(x.id,&X)) { return this->splat(X >> bits); }
896         return {this, push(Op::sra_i32, x.id,NA,NA, bits)};
897     }
898 
    // Shifts applied independently to each 16-bit half of every lane.
    I32 Builder::shl_16x2(I32 x, int k) { return {this, push(Op::shl_i16x2, x.id,NA,NA, k)}; }
    I32 Builder::shr_16x2(I32 x, int k) { return {this, push(Op::shr_i16x2, x.id,NA,NA, k)}; }
    I32 Builder::sra_16x2(I32 x, int k) { return {this, push(Op::sra_i16x2, x.id,NA,NA, k)}; }
902 
    // Float comparisons, producing all-1s (~0) in true lanes and 0 in false
    // lanes, with constant folding.  There are no lt/lte ops: x<y and x<=y
    // are encoded as y>x and y>=x with the operands swapped.
    I32 Builder:: eq(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X==Y ? ~0 : 0); }
        return {this, push(Op::eq_f32, x.id, y.id)};
    }
    I32 Builder::neq(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X!=Y ? ~0 : 0); }
        return {this, push(Op::neq_f32, x.id, y.id)};
    }
    I32 Builder::lt(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(Y> X ? ~0 : 0); }
        return {this, push(Op::gt_f32, y.id, x.id)};
    }
    I32 Builder::lte(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(Y>=X ? ~0 : 0); }
        return {this, push(Op::gte_f32, y.id, x.id)};
    }
    I32 Builder::gt(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X> Y ? ~0 : 0); }
        return {this, push(Op::gt_f32, x.id, y.id)};
    }
    I32 Builder::gte(F32 x, F32 y) {
        float X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X>=Y ? ~0 : 0); }
        return {this, push(Op::gte_f32, x.id, y.id)};
    }
933 
    // Integer comparisons; as with floats, lt/lte swap operands onto gt/gte.
    I32 Builder:: eq(I32 x, I32 y) { return {this, push(Op:: eq_i32, x.id, y.id)}; }
    I32 Builder::neq(I32 x, I32 y) { return {this, push(Op::neq_i32, x.id, y.id)}; }
    I32 Builder:: lt(I32 x, I32 y) { return {this, push(Op:: gt_i32, y.id, x.id)}; }
    I32 Builder::lte(I32 x, I32 y) { return {this, push(Op::gte_i32, y.id, x.id)}; }
    I32 Builder:: gt(I32 x, I32 y) { return {this, push(Op:: gt_i32, x.id, y.id)}; }
    I32 Builder::gte(I32 x, I32 y) { return {this, push(Op::gte_i32, x.id, y.id)}; }

    // The same comparisons on paired 16-bit halves.
    I32 Builder:: eq_16x2(I32 x, I32 y) { return {this, push(Op:: eq_i16x2, x.id, y.id)}; }
    I32 Builder::neq_16x2(I32 x, I32 y) { return {this, push(Op::neq_i16x2, x.id, y.id)}; }
    I32 Builder:: lt_16x2(I32 x, I32 y) { return {this, push(Op:: gt_i16x2, y.id, x.id)}; }
    I32 Builder::lte_16x2(I32 x, I32 y) { return {this, push(Op::gte_i16x2, y.id, x.id)}; }
    I32 Builder:: gt_16x2(I32 x, I32 y) { return {this, push(Op:: gt_i16x2, x.id, y.id)}; }
    I32 Builder::gte_16x2(I32 x, I32 y) { return {this, push(Op::gte_i16x2, x.id, y.id)}; }
947 
    // Bitwise ops with boolean-style peepholes: since comparison results are
    // all-0 or all-1 masks, 0 acts as "false" and ~0 as "true" below.
    I32 Builder::bit_and(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&Y); }
        if (this->isImm(y.id, 0)) { return this->splat(0); }   // (x & false) == false
        if (this->isImm(x.id, 0)) { return this->splat(0); }   // (false & y) == false
        if (this->isImm(y.id,~0)) { return x; }                // (x & true) == x
        if (this->isImm(x.id,~0)) { return y; }                // (true & y) == y
        return {this, push(Op::bit_and, x.id, y.id)};
    }
    I32 Builder::bit_or(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X|Y); }
        if (this->isImm(y.id, 0)) { return x; }                 // (x | false) == x
        if (this->isImm(x.id, 0)) { return y; }                 // (false | y) == y
        if (this->isImm(y.id,~0)) { return this->splat(~0); }   // (x | true) == true
        if (this->isImm(x.id,~0)) { return this->splat(~0); }   // (true | y) == true
        return {this, push(Op::bit_or, x.id, y.id)};
    }
    I32 Builder::bit_xor(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X^Y); }
        if (this->isImm(y.id, 0)) { return x; }   // (x ^ false) == x
        if (this->isImm(x.id, 0)) { return y; }   // (false ^ y) == y
        return {this, push(Op::bit_xor, x.id, y.id)};
    }
    // bit_clear(x,y) == x & ~y.
    I32 Builder::bit_clear(I32 x, I32 y) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X&~Y); }
        if (this->isImm(y.id, 0)) { return x; }                // (x & ~false) == x
        if (this->isImm(y.id,~0)) { return this->splat(0); }   // (x & ~true) == false
        if (this->isImm(x.id, 0)) { return this->splat(0); }   // (false & ~y) == false
        return {this, push(Op::bit_clear, x.id, y.id)};
    }
981 
select(I32 x,I32 y,I32 z)982     I32 Builder::select(I32 x, I32 y, I32 z) {
983         int X,Y,Z;
984         if (this->allImm(x.id,&X, y.id,&Y, z.id,&Z)) { return this->splat(X?Y:Z); }
985         // TODO: some cases to reduce to bit_and when y == 0 or z == 0?
986         return {this, push(Op::select, x.id, y.id, z.id)};
987     }
988 
    // extract() computes z & (x >> bits); when z is a constant covering every
    // remaining bit after the shift, the mask is redundant and we skip it.
    I32 Builder::extract(I32 x, int bits, I32 z) {
        int Z;
        if (this->allImm(z.id,&Z) && (~0u>>bits) == (unsigned)Z) { return this->shr(x, bits); }
        return this->bit_and(z, this->shr(x, bits));
    }

    // pack() computes x | (y << bits), with constant folding.
    I32 Builder::pack(I32 x, I32 y, int bits) {
        int X,Y;
        if (this->allImm(x.id,&X, y.id,&Y)) { return this->splat(X|(Y<<bits)); }
        return {this, push(Op::pack, x.id,y.id,NA, 0,bits)};
    }
1000 
    // Per-lane byte shuffle; the immediate `control` encodes the byte mapping
    // (exact encoding defined by the Op::bytes implementation — see backends).
    I32 Builder::bytes(I32 x, int control) {
        return {this, push(Op::bytes, x.id,NA,NA, control)};
    }
1004 
floor(F32 x)1005     F32 Builder::floor(F32 x) {
1006         float X;
1007         if (this->allImm(x.id,&X)) { return this->splat(floorf(X)); }
1008         return {this, push(Op::floor, x.id)};
1009     }
to_f32(I32 x)1010     F32 Builder::to_f32(I32 x) {
1011         int X;
1012         if (this->allImm(x.id,&X)) { return this->splat((float)X); }
1013         return {this, push(Op::to_f32, x.id)};
1014     }
trunc(F32 x)1015     I32 Builder::trunc(F32 x) {
1016         float X;
1017         if (this->allImm(x.id,&X)) { return this->splat((int)X); }
1018         return {this, push(Op::trunc, x.id)};
1019     }
round(F32 x)1020     I32 Builder::round(F32 x) {
1021         float X;
1022         if (this->allImm(x.id,&X)) { return this->splat((int)lrintf(X)); }
1023         return {this, push(Op::round, x.id)};
1024     }
1025 
from_unorm(int bits,I32 x)1026     F32 Builder::from_unorm(int bits, I32 x) {
1027         F32 limit = splat(1 / ((1<<bits)-1.0f));
1028         return mul(to_f32(x), limit);
1029     }
to_unorm(int bits,F32 x)1030     I32 Builder::to_unorm(int bits, F32 x) {
1031         F32 limit = splat((1<<bits)-1.0f);
1032         return round(mul(x, limit));
1033     }
1034 
    // Unpack fixed-point pixel formats into float channels in [0,1].

    // 10-bit r,g,b with a 2-bit alpha in the top bits.
    Color Builder::unpack_1010102(I32 rgba) {
        return {
            from_unorm(10, extract(rgba,  0, 0x3ff)),
            from_unorm(10, extract(rgba, 10, 0x3ff)),
            from_unorm(10, extract(rgba, 20, 0x3ff)),
            from_unorm( 2, extract(rgba, 30, 0x3  )),
        };
    }
    // Four 8-bit channels, r in the low byte.
    Color Builder::unpack_8888(I32 rgba) {
        return {
            from_unorm(8, extract(rgba,  0, 0xff)),
            from_unorm(8, extract(rgba,  8, 0xff)),
            from_unorm(8, extract(rgba, 16, 0xff)),
            from_unorm(8, extract(rgba, 24, 0xff)),
        };
    }
    // 5-6-5 bit r,g,b packed high-to-low; alpha is implicitly opaque.
    Color Builder::unpack_565(I32 bgr) {
        return {
            from_unorm(5, extract(bgr, 11, 0b011'111)),
            from_unorm(6, extract(bgr,  5, 0b111'111)),
            from_unorm(5, extract(bgr,  0, 0b011'111)),
            splat(1.0f),
        };
    }
1059 
    // Divide r,g,b by alpha to convert premultiplied color to unpremultiplied.
    void Builder::unpremul(F32* r, F32* g, F32* b, F32 a) {
        skvm::F32 invA = div(1.0f, a),
                  inf  = bit_cast(splat(0x7f800000));   // +infinity's bit pattern
        // If a is 0, so are *r,*g,*b, so set invA to 0 to avoid 0*inf=NaN (instead 0*0 = 0).
        // lt(invA, inf) is an all-1s mask for finite invA and all-0s otherwise,
        // so the bit_and keeps finite invA and zeroes the infinite case.
        invA = bit_cast(bit_and(lt(invA, inf),
                                bit_cast(invA)));
        *r = mul(*r, invA);
        *g = mul(*g, invA);
        *b = mul(*b, invA);
    }

    // Multiply r,g,b by alpha to convert unpremultiplied color to premultiplied.
    void Builder::premul(F32* r, F32* g, F32* b, F32 a) {
        *r = mul(*r, a);
        *g = mul(*g, a);
        *b = mul(*b, a);
    }
1076 
    // Transform an unpremul color from `src` color space to premul in `dst`,
    // then push its four channels into `uniforms` and load them as uniform F32s.
    Color Builder::uniformPremul(SkColor4f color,    SkColorSpace* src,
                                 Uniforms* uniforms, SkColorSpace* dst) {
        SkColorSpaceXformSteps(src, kUnpremul_SkAlphaType,
                               dst,   kPremul_SkAlphaType).apply(color.vec());
        return {
            uniformF(uniforms->pushF(color.fR)),
            uniformF(uniforms->pushF(color.fG)),
            uniformF(uniforms->pushF(color.fB)),
            uniformF(uniforms->pushF(color.fA)),
        };
    }
1088 
    // Channel-wise linear interpolation between two colors by t.
    Color Builder::lerp(Color lo, Color hi, F32 t) {
        return {
            lerp(lo.r, hi.r, t),
            lerp(lo.g, hi.g, t),
            lerp(lo.b, hi.b, t),
            lerp(lo.a, hi.a, t),
        };
    }
1097 
    // Convert RGBA to HSLA (hue, saturation, lightness, alpha), h/s/l in [0,1].
    HSLA Builder::to_hsla(Color c) {
        F32 mx = max(max(c.r,c.g),c.b),             // max channel
            mn = min(min(c.r,c.g),c.b),             // min channel
             d = mx - mn,                           // chroma
        g_lt_b = select(c.g < c.b, splat(6.0f)      // +6 wraps negative hue when
                                 , splat(0.0f));    // red is max and g < b

        // (a - b) / chroma, via multiply by the reciprocal.
        auto diffm = [&](auto a, auto b) {
            return (a - b) * (1 / d);
        };

        // Hue sector depends on which channel is the max; each branch
        // produces a value in [0,6) before the final 1/6 scale.
        F32 h = mul(1/6.0f,
                        select(eq(mx,  mn), 0.0f,
                        select(eq(mx, c.r), add(diffm(c.g,c.b), g_lt_b),
                        select(eq(mx, c.g), add(diffm(c.b,c.r), 2.0f)
                                          , add(diffm(c.r,c.g), 4.0f)))));

        F32 sum = add(mx,mn);
        F32   l = mul(sum, 0.5f);                   // lightness = (mx+mn)/2
        F32   s = select(eq(mx,mn), 0.0f            // gray: no saturation
                                  , div(d, select(gt(l,0.5f), sub(2.0f,sum)
                                                            , sum)));
        return {h, s, l, c.a};
    }
1122 
    // Convert HSLA back to RGBA; inverse of to_hsla() above.
    Color Builder::to_rgba(HSLA c) {
        // See GrRGBToHSLFilterEffect.fp

        auto h = c.h;
        auto s = c.s;
        auto l = c.l;
        // x = chroma = (1 - |2l - 1|) * s
        F32 x = mul(sub(1.0f, abs(sub(add(l,l), 1.0f))), s);

        // Map one phase-shifted copy of the hue onto its channel value:
        // a triangle wave in [0,1], scaled by chroma and centered on lightness.
        auto hue_to_rgb = [&](auto hue) {
            auto q = sub(abs(mad(fract(hue), splat(6.0f), splat(-3.0f))), splat(1.0f));
            return mad(sub(clamp01(q), splat(0.5f)), x, l);
        };

        // r, g, b sample the wave 1/3 apart in hue.
        return {
            hue_to_rgb(add(h, 0/3.0f)),
            hue_to_rgb(add(h, 2/3.0f)),
            hue_to_rgb(add(h, 1/3.0f)),
            c.a,
        };
    }
1143 
1144     // We're basing our implementation of non-separable blend modes on
1145     //   https://www.w3.org/TR/compositing-1/#blendingnonseparable.
1146     // and
1147     //   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
1148     // They're equivalent, but ES' math has been better simplified.
1149     //
1150     // Anything extra we add beyond that is to make the math work with premul inputs.
1151 
saturation(skvm::Builder * p,skvm::F32 r,skvm::F32 g,skvm::F32 b)1152     static skvm::F32 saturation(skvm::Builder* p, skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1153         return max(r, max(g, b))
1154              - min(r, min(g, b));
1155     }
1156 
luminance(skvm::Builder * p,skvm::F32 r,skvm::F32 g,skvm::F32 b)1157     static skvm::F32 luminance(skvm::Builder* p, skvm::F32 r, skvm::F32 g, skvm::F32 b) {
1158         return r*0.30f + (g*0.59f + b*0.11f);
1159     }
1160 
    // Rescale r,g,b so their saturation (max - min) becomes s, keeping their
    // relative order: min channel -> 0, max channel -> s, middle proportional.
    static void set_sat(skvm::Builder* p, skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 s) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            sat = mx - mn;

        // Map min channel to 0, max channel to s, and scale the middle proportionally.
        auto scale = [&](auto c) {
            // TODO: better to divide and check for non-finite result?
            return select(sat == 0.0f, 0.0f
                                     , ((c - mn) * s) / sat);
        };
        *r = scale(*r);
        *g = scale(*g);
        *b = scale(*b);
    }
1176 
set_lum(skvm::Builder * p,skvm::F32 * r,skvm::F32 * g,skvm::F32 * b,skvm::F32 lu)1177     static void set_lum(skvm::Builder* p, skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 lu) {
1178         auto diff = lu - luminance(p, *r, *g, *b);
1179         *r += diff;
1180         *g += diff;
1181         *b += diff;
1182     }
1183 
    // Pull any out-of-range channel back into [0, a] by scaling it toward the
    // color's luminance (the standard non-separable blend "ClipColor" step).
    static void clip_color(skvm::Builder* p,
                           skvm::F32* r, skvm::F32* g, skvm::F32* b, skvm::F32 a) {
        F32 mn  = min(*r, min(*g, *b)),
            mx  = max(*r, max(*g, *b)),
            lu = luminance(p, *r, *g, *b);

        auto clip = [&](auto c) {
            // If any channel went negative, compress the low side toward lu.
            c = select(mn >= 0, c
                              , lu + ((c-lu)*(  lu)) / (lu-mn));
            // If any channel exceeded alpha, compress the high side toward lu.
            c = select(mx >  a, lu + ((c-lu)*(a-lu)) / (mx-lu)
                              , c);
            // Sometimes without this we may dip just a little negative.
            return max(c, 0.0f);
        };
        *r = clip(*r);
        *g = clip(*g);
        *b = clip(*b);
    }
1202 
blend(SkBlendMode mode,Color src,Color dst)1203     Color Builder::blend(SkBlendMode mode, Color src, Color dst) {
1204         auto mma = [this](skvm::F32 x, skvm::F32 y, skvm::F32 z, skvm::F32 w) {
1205             return mad(x,y, mul(z,w));
1206         };
1207 
1208         auto two = [this](skvm::F32 x) { return add(x, x); };
1209 
1210         auto apply_rgba = [&](auto&& fn) {
1211             return Color {
1212                 fn(src.r, dst.r),
1213                 fn(src.g, dst.g),
1214                 fn(src.b, dst.b),
1215                 fn(src.a, dst.a),
1216             };
1217         };
1218 
1219         auto apply_rgb_srcover_a = [&](auto&& fn) {
1220             return Color {
1221                 fn(src.r, dst.r),
1222                 fn(src.g, dst.g),
1223                 fn(src.b, dst.b),
1224                 mad(dst.a, 1-src.a, src.a),   // srcover for alpha
1225             };
1226         };
1227 
1228         auto non_sep = [&](auto R, auto G, auto B) {
1229             return Color{
1230                 add(mma(src.r, 1-dst.a,  dst.r, 1-src.a), R),
1231                 add(mma(src.g, 1-dst.a,  dst.g, 1-src.a), G),
1232                 add(mma(src.b, 1-dst.a,  dst.b, 1-src.a), B),
1233                 mad(dst.a,1-src.a, src.a),  // srcover
1234             };
1235         };
1236 
1237         switch (mode) {
1238             default: SkASSERT(false); /*but also, for safety, fallthrough*/
1239 
1240             case SkBlendMode::kClear: return { splat(0.0f), splat(0.0f), splat(0.0f), splat(0.0f) };
1241 
1242             case SkBlendMode::kSrc: return src;
1243             case SkBlendMode::kDst: return dst;
1244 
1245             case SkBlendMode::kDstOver: std::swap(src, dst); // fall-through
1246             case SkBlendMode::kSrcOver:
1247                 return apply_rgba([&](auto s, auto d) {
1248                     return this->mad(d,1-src.a, s);
1249                 });
1250 
1251             case SkBlendMode::kDstIn: std::swap(src, dst); // fall-through
1252             case SkBlendMode::kSrcIn:
1253                 return apply_rgba([&](auto s, auto d) {
1254                     return this->mul(s, dst.a);
1255                 });
1256 
1257             case SkBlendMode::kDstOut: std::swap(src, dst); // fall-through
1258             case SkBlendMode::kSrcOut:
1259                 return apply_rgba([&](auto s, auto d) {
1260                     return this->mul(s, 1-dst.a);
1261                 });
1262 
1263             case SkBlendMode::kDstATop: std::swap(src, dst); // fall-through
1264             case SkBlendMode::kSrcATop:
1265                 return apply_rgba([&](auto s, auto d) {
1266                     return mma(s, dst.a,  d, 1-src.a);
1267                 });
1268 
1269             case SkBlendMode::kXor:
1270                 return apply_rgba([&](auto s, auto d) {
1271                     return mma(s, 1-dst.a,  d, 1-src.a);
1272                 });
1273 
1274             case SkBlendMode::kPlus:
1275                 return apply_rgba([&](auto s, auto d) {
1276                     return this->min(add(s, d), 1.0f);
1277                 });
1278 
1279             case SkBlendMode::kModulate:
1280                 return apply_rgba([&](auto s, auto d) {
1281                     return this->mul(s, d);
1282                 });
1283 
1284             case SkBlendMode::kScreen:
1285                 // (s+d)-(s*d) gave us trouble with our "r,g,b <= after blending" asserts.
1286                 // It's kind of plausible that s + (d - sd) keeps more precision?
1287                 return apply_rgba([&](auto s, auto d) {
1288                     return this->add(s, this->sub(d, this->mul(s, d)));
1289                 });
1290 
1291             case SkBlendMode::kDarken:
1292                 return apply_rgb_srcover_a([&](auto s, auto d) {
1293                     return this->add(s, this->sub(d, this->max(this->mul(s, dst.a),
1294                                              this->mul(d, src.a))));
1295                 });
1296 
1297             case SkBlendMode::kLighten:
1298                 return apply_rgb_srcover_a([&](auto s, auto d) {
1299                     return this->add(s, this->sub(d, this->min(this->mul(s, dst.a),
1300                                              this->mul(d, src.a))));
1301                 });
1302 
1303             case SkBlendMode::kDifference:
1304                 return apply_rgb_srcover_a([&](auto s, auto d) {
1305                     return this->add(s, this->sub(d, two(this->min(this->mul(s, dst.a),
1306                                                  this->mul(d, src.a)))));
1307                 });
1308 
1309             case SkBlendMode::kExclusion:
1310                 return apply_rgb_srcover_a([&](auto s, auto d) {
1311                     return this->add(s, this->sub(d, two(this->mul(s, d))));
1312                 });
1313 
1314             case SkBlendMode::kColorBurn:
1315                 return apply_rgb_srcover_a([&](auto s, auto d) {
1316                     // TODO: divide and check for non-finite result instead of checking for s == 0.
1317                     auto mn   = this->min(dst.a,
1318                                     this->div(this->mul(this->sub(dst.a, d), src.a), s)),
1319                          burn = this->mad(src.a, this->sub(dst.a, mn), mma(s, 1-dst.a, d, 1-src.a));
1320                     return select(eq(d, dst.a), this->mad(s, 1-dst.a, d),
1321                            select(eq(s,  0.0f), this->mul(d, 1-src.a)
1322                                               , burn));
1323                 });
1324 
1325             case SkBlendMode::kColorDodge:
1326                 return apply_rgb_srcover_a([&](auto s, auto d) {
1327                     // TODO: divide and check for non-finite result instead of checking for s == sa.
1328                     auto dodge = this->mad(src.a, this->min(dst.a,
1329                                                 this->div(this->mul(d, src.a), this->sub(src.a, s))),
1330                                      mma(s, 1-dst.a, d, 1-src.a));
1331                     return select(eq(d,  0.0f), mul(s, 1-dst.a),
1332                            select(eq(s, src.a), mad(d, 1-src.a, s)
1333                                               , dodge));
1334                 });
1335 
1336             case SkBlendMode::kHardLight:
1337                 return apply_rgb_srcover_a([&](auto s, auto d) {
1338                     return add(mma(s, 1-dst.a, d, 1-src.a),
1339                                select(lte(two(s), src.a),
1340                                       two(mul(s, d)),
1341                                       sub(mul(src.a, dst.a), two(mul(sub(dst.a, d), sub(src.a, s))))));
1342                 });
1343 
1344             case SkBlendMode::kOverlay:
1345                 return apply_rgb_srcover_a([&](auto s, auto d) {
1346                     return add(mma(s, 1-dst.a, d, 1-src.a),
1347                                select(lte(two(d), dst.a),
1348                                       two(mul(s, d)),
1349                                       sub(mul(src.a, dst.a), two(mul(sub(dst.a, d), sub(src.a, s))))));
1350                 });
1351 
1352             case SkBlendMode::kMultiply:
1353                 return apply_rgba([&](auto s, auto d) {
1354                     return this->add(mma(s, 1-dst.a, d, 1-src.a), this->mul(s, d));
1355                 });
1356 
1357             case SkBlendMode::kSoftLight:
1358                 return apply_rgb_srcover_a([&](auto s, auto d) {
1359                     auto  m = select(gt(dst.a, 0.0f), div(d, dst.a), 0.0f),
1360                          s2 = two(s),
1361                          m4 = two(two(m));
1362 
1363                          // The logic forks three ways:
1364                          //    1. dark src?
1365                          //    2. light src, dark dst?
1366                          //    3. light src, light dst?
1367 
1368                          // Used in case 1
1369                     auto darkSrc = mul(d, mad(sub(s2, src.a), 1-m, src.a)),
1370                          // Used in case 2
1371                          darkDst = mad(mad(m4, m4, m4), sub(m, 1.0f), mul(7.0f, m)),
1372                          // Used in case 3.
1373                          liteDst = sub(sqrt(m), m),
1374                          // Used in 2 or 3?
1375                          liteSrc = mad(mul(dst.a, sub(s2, src.a)),
1376                                        select(lte(two(two(d)), dst.a), darkDst, liteDst),
1377                                        mul(d, src.a));
1378                     return mad(s, 1-dst.a, mad(d,
1379                                                1-src.a,
1380                                                select(lte(s2, src.a), darkSrc, liteSrc)));
1381 
1382 
1383                 });
1384 
1385             case SkBlendMode::kHue: {
1386                 skvm::F32 R = mul(src.r, src.a),
1387                           G = mul(src.g, src.a),
1388                           B = mul(src.b, src.a);
1389 
1390                 set_sat(this, &R, &G, &B, mul(saturation(this, dst.r, dst.g, dst.b), src.a));
1391                 set_lum(this, &R, &G, &B, mul( luminance(this, dst.r, dst.g, dst.b), src.a));
1392                 clip_color(this, &R, &G, &B, mul(src.a, dst.a));
1393 
1394                 return non_sep(R, G, B);
1395             }
1396 
1397             case SkBlendMode::kSaturation: {
1398                 skvm::F32 R = mul(dst.r, src.a),
1399                           G = mul(dst.g, src.a),
1400                           B = mul(dst.b, src.a);
1401 
1402                 set_sat(this, &R, &G, &B, mul(saturation(this, src.r, src.g, src.b), dst.a));
1403                 set_lum(this, &R, &G, &B, mul( luminance(this, dst.r, dst.g, dst.b), src.a));
1404                 clip_color(this, &R, &G, &B, mul(src.a, dst.a));
1405 
1406                 return non_sep(R, G, B);
1407             }
1408 
1409             case SkBlendMode::kColor: {
1410                 skvm::F32 R = mul(src.r, dst.a),
1411                           G = mul(src.g, dst.a),
1412                           B = mul(src.b, dst.a);
1413 
1414                 set_lum(this, &R, &G, &B, mul(luminance(this, dst.r, dst.g, dst.b), src.a));
1415                 clip_color(this, &R, &G, &B, mul(src.a, dst.a));
1416 
1417                 return non_sep(R, G, B);
1418             }
1419 
1420             case SkBlendMode::kLuminosity: {
1421                 skvm::F32 R = mul(dst.r, src.a),
1422                           G = mul(dst.g, src.a),
1423                           B = mul(dst.b, src.a);
1424 
1425                 set_lum(this, &R, &G, &B, mul(luminance(this, src.r, src.g, src.b), dst.a));
1426                 clip_color(this, &R, &G, &B, mul(src.a, dst.a));
1427 
1428                 return non_sep(R, G, B);
1429             }
1430         }
1431     }
1432 
1433     // Fill live and sinks each if non-null:
1434     //    - (*live)[id]: notes whether each input instruction is live
1435     //    - *sinks: an unsorted set of live instructions with side effects (stores, assert_true)
1436     // Returns the number of live instructions.
liveness_analysis(const std::vector<Instruction> & instructions,std::vector<bool> * live,std::vector<Val> * sinks)1437     int liveness_analysis(const std::vector<Instruction>& instructions,
1438                           std::vector<bool>* live,
1439                           std::vector<Val>*  sinks) {
1440         int instruction_count = instructions.size();
1441         live->resize(instruction_count, false);
1442         int liveInstructionCount = 0;
1443         auto trace = [&](Val id, auto& recurse) -> void {
1444           if (!(*live)[id]) {
1445               (*live)[id] = true;
1446               liveInstructionCount++;
1447               Instruction inst = instructions[id];
1448               if (inst.x != NA) { recurse(inst.x, recurse); }
1449               if (inst.y != NA) { recurse(inst.y, recurse); }
1450               if (inst.z != NA) { recurse(inst.z, recurse); }
1451           }
1452         };
1453 
1454         // For all the sink instructions.
1455         for (Val id = 0; id < instruction_count; id++) {
1456             if (instructions[id].op <= skvm::Op::store32) {
1457                 sinks->push_back(id);
1458                 trace(id, trace);
1459             }
1460         }
1461         return liveInstructionCount;
1462     }
1463 
1464     // For a given program we'll store each Instruction's users contiguously in a table,
1465     // and track where each Instruction's span of users starts and ends in another index.
1466     // Here's a simple program that loads x and stores kx+k:
1467     //
1468     //  v0 = splat(k)
1469     //  v1 = load(...)
1470     //  v2 = mul(v1, v0)
1471     //  v3 = add(v2, v0)
1472     //  v4 = store(..., v3)
1473     //
1474     // This program has 5 instructions v0-v4.
1475     //    - v0 is used by v2 and v3
1476     //    - v1 is used by v2
1477     //    - v2 is used by v3
1478     //    - v3 is used by v4
1479     //    - v4 has a side-effect
1480     //
1481     // For this program we fill out these two arrays:
1482     //     table:  [v2,v3, v2, v3, v4]
1483     //     index:  [0,     2,  3,  4,  5]
1484     //
1485     // The table is just those "is used by ..." I wrote out above in order,
1486     // and the index tracks where an Instruction's span of users starts, table[index[id]].
1487     // The span continues up until the start of the next Instruction, table[index[id+1]].
users(Val id) const1488     SkSpan<const Val> Usage::users(Val id) const {
1489         int begin = fIndex[id];
1490         int end   = fIndex[id + 1];
1491         return SkMakeSpan(fTable.data() + begin, end - begin);
1492     }
1493 
Usage(const std::vector<Instruction> & program,const std::vector<bool> & live)1494     Usage::Usage(const std::vector<Instruction>& program, const std::vector<bool>& live) {
1495         // uses[id] counts the number of times each Instruction is used.
1496         std::vector<int> uses(program.size(), 0);
1497         for (Val id = 0; id < (Val)program.size(); id++) {
1498             if (live[id]) {
1499                 Instruction inst = program[id];
1500                 if (inst.x != NA) { ++uses[inst.x]; }
1501                 if (inst.y != NA) { ++uses[inst.y]; }
1502                 if (inst.z != NA) { ++uses[inst.z]; }
1503             }
1504         }
1505 
1506         // Build our index into fTable, with an extra entry marking the final Instruction's end.
1507         fIndex.reserve(program.size() + 1);
1508         int total_uses = 0;
1509         for (int n : uses) {
1510             fIndex.push_back(total_uses);
1511             total_uses += n;
1512         }
1513         fIndex.push_back(total_uses);
1514 
1515         // Tick down each Instruction's uses to fill in fTable.
1516         fTable.resize(total_uses, NA);
1517         for (Val id = (Val)program.size(); id --> 0; ) {
1518             if (live[id]) {
1519                 Instruction inst = program[id];
1520                 if (inst.x != NA) { fTable[fIndex[inst.x] + --uses[inst.x]] = id; }
1521                 if (inst.y != NA) { fTable[fIndex[inst.y] + --uses[inst.y]] = id; }
1522                 if (inst.z != NA) { fTable[fIndex[inst.z] + --uses[inst.z]] = id; }
1523             }
1524         }
1525         for (int n  : uses  ) { (void)n;  SkASSERT(n  == 0 ); }
1526         for (Val id : fTable) { (void)id; SkASSERT(id != NA); }
1527     }
1528 
1529     // ~~~~ Program::eval() and co. ~~~~ //
1530 
1531     // Handy references for x86-64 instruction encoding:
1532     // https://wiki.osdev.org/X86-64_Instruction_Encoding
1533     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
1534     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
1535     // http://ref.x86asm.net/coder64.html
1536 
1537     // Used for ModRM / immediate instruction encoding.
_233(int a,int b,int c)1538     static uint8_t _233(int a, int b, int c) {
1539         return (a & 3) << 6
1540              | (b & 7) << 3
1541              | (c & 7) << 0;
1542     }
1543 
    // ModRM byte encodes the arguments of an opcode.
    // Mod is the 2-bit addressing-mode field: Indirect means [rm], OneByteImm and
    // FourByteImm add an 8- or 32-bit displacement, and Direct is a plain register.
    enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
    static uint8_t mod_rm(Mod mod, int reg, int rm) {
        return _233((int)mod, reg, rm);
    }
1549 
mod(int imm)1550     static Mod mod(int imm) {
1551         if (imm == 0)               { return Mod::Indirect; }
1552         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
1553         return Mod::FourByteImm;
1554     }
1555 
imm_bytes(Mod mod)1556     static int imm_bytes(Mod mod) {
1557         switch (mod) {
1558             case Mod::Indirect:    return 0;
1559             case Mod::OneByteImm:  return 1;
1560             case Mod::FourByteImm: return 4;
1561             case Mod::Direct: SkUNREACHABLE;
1562         }
1563         SkUNREACHABLE;
1564     }
1565 
1566     // SIB byte encodes a memory address, base + (index * scale).
sib(Assembler::Scale scale,int index,int base)1567     static uint8_t sib(Assembler::Scale scale, int index, int base) {
1568         return _233((int)scale, index, base);
1569     }
1570 
1571     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
rex(bool W,bool R,bool X,bool B)1572     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
1573                        bool R,   // Extra top bit to select ModRM reg, registers 8-15.
1574                        bool X,   // Extra top bit for SIB index register.
1575                        bool B) { // Extra top bit for SIB base or ModRM rm register.
1576         return 0b01000000   // Fixed 0100 for top four bits.
1577              | (W << 3)
1578              | (R << 2)
1579              | (X << 1)
1580              | (B << 0);
1581     }
1582 
1583 
    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    struct VEX {
        int     len;        // 2 or 3 meaningful prefix bytes, chosen by vex() below.
        uint8_t bytes[3];   // Only the first `len` entries are valid.
    };
1589 
vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)1590     static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
1591                    bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
1592                    bool   X,   // Same as REX X.
1593                    bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
1594                    int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
1595                    int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
1596                    bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
1597                    int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
1598 
1599         // Pack x86 opcode map selector to 5-bit VEX encoding.
1600         map = [map]{
1601             switch (map) {
1602                 case   0x0f: return 0b00001;
1603                 case 0x380f: return 0b00010;
1604                 case 0x3a0f: return 0b00011;
1605                 // Several more cases only used by XOP / TBM.
1606             }
1607             SkUNREACHABLE;
1608         }();
1609 
1610         // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
1611         pp = [pp]{
1612             switch (pp) {
1613                 case 0x66: return 0b01;
1614                 case 0xf3: return 0b10;
1615                 case 0xf2: return 0b11;
1616             }
1617             return 0b00;
1618         }();
1619 
1620         VEX vex = {0, {0,0,0}};
1621         if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
1622             // With these conditions met, we can optionally compress VEX to 2-byte.
1623             vex.len = 2;
1624             vex.bytes[0] = 0xc5;
1625             vex.bytes[1] = (pp      &  3) << 0
1626                          | (L       &  1) << 2
1627                          | (~vvvv   & 15) << 3
1628                          | (~(int)R &  1) << 7;
1629         } else {
1630             // We could use this 3-byte VEX prefix all the time if we like.
1631             vex.len = 3;
1632             vex.bytes[0] = 0xc4;
1633             vex.bytes[1] = (map     & 31) << 0
1634                          | (~(int)B &  1) << 5
1635                          | (~(int)X &  1) << 6
1636                          | (~(int)R &  1) << 7;
1637             vex.bytes[2] = (pp    &  3) << 0
1638                          | (L     &  1) << 2
1639                          | (~vvvv & 15) << 3
1640                          | (WE    &  1) << 7;
1641         }
1642         return vex;
1643     }
1644 
    // buf may be nullptr: then the Assembler only counts bytes (see bytes() below),
    // letting a first pass measure how much memory to allocate for the code.
    Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}

    // Total bytes emitted (or that would have been emitted) so far.
    size_t Assembler::size() const { return fSize; }

    void Assembler::bytes(const void* p, int n) {
        if (fCurr) {
            memcpy(fCurr, p, n);   // Write only when we have a real buffer...
            fCurr += n;
        }
        fSize += n;                // ...but always account for the size.
    }

    void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
    void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
1659 
    // Pad with zero bytes until the code size is a multiple of mod.
    void Assembler::align(int mod) {
        while (this->size() % mod) {
            this->byte(0x00);
        }
    }

    // Software breakpoint (0xCC), handy when debugging generated code.
    void Assembler::int3() {
        this->byte(0xcc);
    }

    // Zero the upper halves of all ymm registers (avoids AVX/SSE transition penalties).
    void Assembler::vzeroupper() {
        this->byte(0xc5);
        this->byte(0xf8);
        this->byte(0x77);
    }
    void Assembler::ret() { this->byte(0xc3); }
1676 
1677     // Common instruction building for 64-bit opcodes with an immediate argument.
op(int opcode,int opcode_ext,GP64 dst,int imm)1678     void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) {
1679         opcode |= 0b0000'0001;   // low bit set for 64-bit operands
1680         opcode |= 0b1000'0000;   // top bit set for instructions with any immediate
1681 
1682         int imm_bytes = 4;
1683         if (SkTFitsIn<int8_t>(imm)) {
1684             imm_bytes = 1;
1685             opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
1686         }
1687 
1688         this->byte(rex(1,0,0,dst>>3));
1689         this->byte(opcode);
1690         this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7));
1691         this->bytes(&imm, imm_bytes);
1692     }
1693 
    // 64-bit register-with-immediate arithmetic; 0b… is the ModRM opcode extension.
    void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); }   // /0
    void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); }   // /5
    void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); }   // /7
1697 
movq(GP64 dst,GP64 src,int off)1698     void Assembler::movq(GP64 dst, GP64 src, int off) {
1699         this->byte(rex(1,dst>>3,0,src>>3));
1700         this->byte(0x8b);
1701         this->byte(mod_rm(mod(off), dst&7, src&7));
1702         this->bytes(&off, imm_bytes(mod(off)));
1703     }
1704 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,Ymm y,bool W)1705     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) {
1706         VEX v = vex(W, dst>>3, 0, y>>3,
1707                     map, x, 1/*ymm, not xmm*/, prefix);
1708         this->bytes(v.bytes, v.len);
1709         this->byte(opcode);
1710         this->byte(mod_rm(Mod::Direct, dst&7, y&7));
1711     }
1712 
    // 32-bit integer lane arithmetic.
    void Assembler::vpaddd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Ymm        y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    // 16-bit integer lane arithmetic.
    void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }

    // Bitwise logic; vpandn computes (~x) & y.
    void Assembler::vpand (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, YmmOrLabel y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Ymm        y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

    // Single-precision float math.
    void Assembler::vaddps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Ymm        y) { this->op(0,0x0f,0x5e, dst,x,y); }
    void Assembler::vminps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5d, dst,x,y); }
    void Assembler::vmaxps(Ymm dst, Ymm x, YmmOrLabel y) { this->op(0,0x0f,0x5f, dst,x,y); }

    // Fused multiply-adds; the 132/213/231 suffix names the operand ordering.
    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    void Assembler::vfmsub132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x9a, dst,x,y); }
    void Assembler::vfmsub213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xaa, dst,x,y); }
    void Assembler::vfmsub231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xba, dst,x,y); }

    void Assembler::vfnmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x9c, dst,x,y); }
    void Assembler::vfnmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xac, dst,x,y); }
    void Assembler::vfnmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xbc, dst,x,y); }

    // Narrowing packs with unsigned saturation: 32→16 bit, then 16→8 bit.
    void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    // Lane-wise integer comparisons producing all-1s / all-0s masks.
    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); }
1749 
    // Float comparison; imm selects the predicate (eq, lt, le, unord, ...).
    void Assembler::vcmpps(Ymm dst, Ymm x, Ymm y, int imm) {
        this->op(0,0x0f,0xc2, dst,x,y);
        this->byte(imm);
    }
1754 
vpblendvb(Ymm dst,Ymm x,Ymm y,Ymm z)1755     void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) {
1756         int prefix = 0x66,
1757             map    = 0x3a0f,
1758             opcode = 0x4c;
1759         VEX v = vex(0, dst>>3, 0, y>>3,
1760                     map, x, /*ymm?*/1, prefix);
1761         this->bytes(v.bytes, v.len);
1762         this->byte(opcode);
1763         this->byte(mod_rm(Mod::Direct, dst&7, y&7));
1764         this->byte(z << 4);
1765     }
1766 
    // dst = x op /opcode_ext imm
    // Shared emitter for the shift-by-immediate instructions below.
    void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) {
        // This is a little weird, but if we pass the opcode_ext as if it were the dst register,
        // the dst register as if x, and the x register as if y, all the bits end up where we want.
        this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x);
        this->byte(imm);
    }
1774 
    // 32-bit lane shifts by immediate: left logical, right logical, right arithmetic.
    void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); }
    void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); }
    void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); }

    // 16-bit lane right logical shift by immediate.
    void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); }
1780 
1781 
vpermq(Ymm dst,Ymm x,int imm)1782     void Assembler::vpermq(Ymm dst, Ymm x, int imm) {
1783         // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
1784         bool W = true;
1785         this->op(0x66,0x3a0f,0x00, dst,x,W);
1786         this->byte(imm);
1787     }
1788 
    // Round float lanes; imm selects the rounding mode (nearest/floor/ceil/trunc).
    void Assembler::vroundps(Ymm dst, Ymm x, int imm) {
        this->op(0x66,0x3a0f,0x08, dst,x);
        this->byte(imm);
    }
1793 
    // Register-to-register 256-bit move.
    void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); }

    // int32 <-> float lane conversions (vcvttps2dq truncates, vcvtps2dq rounds),
    // plus lane-wise float square root.
    void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op(   0,0x0f,0x5b, dst,x); }
    void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }
    void Assembler::vcvtps2dq (Ymm dst, Ymm x) { this->op(0x66,0x0f,0x5b, dst,x); }
    void Assembler::vsqrtps   (Ymm dst, Ymm x) { this->op(   0,0x0f,0x51, dst,x); }
1800 
    // A Label at the current end of the code buffer, with no references yet.
    Assembler::Label Assembler::here() {
        return { (int)this->size(), Label::NotYetSet, {} };
    }
1804 
disp19(Label * l)1805     int Assembler::disp19(Label* l) {
1806         SkASSERT(l->kind == Label::NotYetSet ||
1807                  l->kind == Label::ARMDisp19);
1808         l->kind = Label::ARMDisp19;
1809         l->references.push_back(here().offset);
1810         // ARM 19-bit instruction count, from the beginning of this instruction.
1811         return (l->offset - here().offset) / 4;
1812     }
1813 
disp32(Label * l)1814     int Assembler::disp32(Label* l) {
1815         SkASSERT(l->kind == Label::NotYetSet ||
1816                  l->kind == Label::X86Disp32);
1817         l->kind = Label::X86Disp32;
1818         l->references.push_back(here().offset);
1819         // x86 32-bit byte count, from the end of this instruction.
1820         return l->offset - (here().offset + 4);
1821     }
1822 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,Label * l)1823     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) {
1824         // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
1825         const int rip = rbp;
1826 
1827         VEX v = vex(0, dst>>3, 0, rip>>3,
1828                     map, x, /*ymm?*/1, prefix);
1829         this->bytes(v.bytes, v.len);
1830         this->byte(opcode);
1831         this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
1832         this->word(this->disp32(l));
1833     }
1834 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,YmmOrLabel y)1835     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, YmmOrLabel y) {
1836         y.label ? this->op(prefix,map,opcode,dst,x, y.label)
1837                 : this->op(prefix,map,opcode,dst,x, y.ymm  );
1838     }
1839 
    // Ops taking an IP-relative memory operand (a Label to constant data).
    void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); }
    void Assembler::vptest(Ymm dst, Label* l) { this->op(0x66, 0x380f, 0x17, dst, (Ymm)0, l); }

    void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); }
    void Assembler::vbroadcastss(Ymm dst, Xmm src)  { this->op(0x66,0x380f,0x18, dst, (Ymm)src); }
vbroadcastss(Ymm dst,GP64 ptr,int off)1845     void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) {
1846         int prefix = 0x66,
1847                map = 0x380f,
1848             opcode = 0x18;
1849         VEX v = vex(0, dst>>3, 0, ptr>>3,
1850                     map, 0, /*ymm?*/1, prefix);
1851         this->bytes(v.bytes, v.len);
1852         this->byte(opcode);
1853 
1854         this->byte(mod_rm(mod(off), dst&7, ptr&7));
1855         this->bytes(&off, imm_bytes(mod(off)));
1856     }
1857 
    // Conditional jump; `condition` is the second opcode byte (e.g. 0x84 for je).
    void Assembler::jump(uint8_t condition, Label* l) {
        // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
        //    7?     one-byte-disp
        //    0F 8? four-byte-disp
        // We always use the near displacement to make updating labels simpler (no resizing).
        this->byte(0x0f);
        this->byte(condition);
        this->word(this->disp32(l));
    }
    void Assembler::je (Label* l) { this->jump(0x84, l); }   // jump if equal (ZF=1)
    void Assembler::jne(Label* l) { this->jump(0x85, l); }   // jump if not equal (ZF=0)
    void Assembler::jl (Label* l) { this->jump(0x8c, l); }   // jump if signed less (SF!=OF)
    void Assembler::jc (Label* l) { this->jump(0x82, l); }   // jump if carry (CF=1)
1871 
    // Unconditional jump (E9 rel32).
    void Assembler::jmp(Label* l) {
        // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
        this->byte(0xe9);
        this->word(this->disp32(l));
    }
1877 
load_store(int prefix,int map,int opcode,Ymm ymm,GP64 ptr)1878     void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
1879         VEX v = vex(0, ymm>>3, 0, ptr>>3,
1880                     map, 0, /*ymm?*/1, prefix);
1881         this->bytes(v.bytes, v.len);
1882         this->byte(opcode);
1883         this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7));
1884     }
1885 
    // 256-bit load; vpmovzx* zero-extend 8 u16s / 8 u8s up into 32-bit lanes.
    void Assembler::vmovups  (Ymm dst, GP64 src) { this->load_store(0   ,  0x0f,0x10, dst,src); }
    void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); }
    void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); }

    // 256-bit store.
    void Assembler::vmovups  (GP64 dst, Ymm src) { this->load_store(0   ,  0x0f,0x11, src,dst); }
vmovups(GP64 dst,Xmm src)1891     void Assembler::vmovups  (GP64 dst, Xmm src) {
1892         // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0.
1893         int prefix = 0,
1894             map    = 0x0f,
1895             opcode = 0x11;
1896         VEX v = vex(0, src>>3, 0, dst>>3,
1897                     map, 0, /*ymm?*/0, prefix);
1898         this->bytes(v.bytes, v.len);
1899         this->byte(opcode);
1900         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
1901     }
1902 
vmovq(GP64 dst,Xmm src)1903     void Assembler::vmovq(GP64 dst, Xmm src) {
1904         int prefix = 0x66,
1905             map    = 0x0f,
1906             opcode = 0xd6;
1907         VEX v = vex(0, src>>3, 0, dst>>3,
1908                     map, 0, /*ymm?*/0, prefix);
1909         this->bytes(v.bytes, v.len);
1910         this->byte(opcode);
1911         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
1912     }
1913 
vmovd(GP64 dst,Xmm src)1914     void Assembler::vmovd(GP64 dst, Xmm src) {
1915         int prefix = 0x66,
1916             map    = 0x0f,
1917             opcode = 0x7e;
1918         VEX v = vex(0, src>>3, 0, dst>>3,
1919                     map, 0, /*ymm?*/0, prefix);
1920         this->bytes(v.bytes, v.len);
1921         this->byte(opcode);
1922         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
1923     }
1924 
vmovd_direct(GP64 dst,Xmm src)1925     void Assembler::vmovd_direct(GP64 dst, Xmm src) {
1926         int prefix = 0x66,
1927             map    = 0x0f,
1928             opcode = 0x7e;
1929         VEX v = vex(0, src>>3, 0, dst>>3,
1930                     map, 0, /*ymm?*/0, prefix);
1931         this->bytes(v.bytes, v.len);
1932         this->byte(opcode);
1933         this->byte(mod_rm(Mod::Direct, src&7, dst&7));
1934     }
1935 
vmovd(Xmm dst,GP64 src)1936     void Assembler::vmovd(Xmm dst, GP64 src) {
1937         int prefix = 0x66,
1938             map    = 0x0f,
1939             opcode = 0x6e;
1940         VEX v = vex(0, dst>>3, 0, src>>3,
1941                     map, 0, /*ymm?*/0, prefix);
1942         this->bytes(v.bytes, v.len);
1943         this->byte(opcode);
1944         this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
1945     }
1946 
    // vmovd, indexed load form: load 32 bits from [base + (index << scale)] into xmm `dst`.
    void Assembler::vmovd(Xmm dst, Scale scale, GP64 index, GP64 base) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x6e;
        // index's high bit rides in the VEX X field, base's in the B field.
        VEX v = vex(0, dst>>3, index>>3, base>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        // rm = rsp (0b100) is the escape that says "a SIB byte follows".
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
        this->byte(sib(scale, index&7, base&7));
    }
1958 
    // vmovd, register form: copy the low 32 bits of GP register `src` into xmm `dst`.
    void Assembler::vmovd_direct(Xmm dst, GP64 src) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0x6e;
        VEX v = vex(0, dst>>3, 0, src>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Direct, dst&7, src&7));
    }
1969 
    // movzbl: zero-extend the byte at [src + off] into 32-bit register `dst` (MOVZX, 0F B6).
    void Assembler::movzbl(GP64 dst, GP64 src, int off) {
        // A REX prefix is only needed when either register uses the extended set (r8-r15).
        if ((dst>>3) || (src>>3)) {
            this->byte(rex(0,dst>>3,0,src>>3));
        }
        this->byte(0x0f);
        this->byte(0xb6);
        // mod(off) picks the shortest displacement encoding; imm_bytes() matches it.
        this->byte(mod_rm(mod(off), dst&7, src&7));
        this->bytes(&off, imm_bytes(mod(off)));
    }
1979 
1980 
    // movb: store the low byte of register `src` to memory at [dst] (MOV r/m8, r8; opcode 0x88).
    void Assembler::movb(GP64 dst, GP64 src) {
        if ((dst>>3) || (src>>3)) {
            this->byte(rex(0,src>>3,0,dst>>3));
        }
        this->byte(0x88);
        this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
    }
1988 
    // vpinsrw: dst = src with 16-bit lane `imm` replaced by the word loaded from [ptr].
    // Encodes VEX.66.0F C4 /r ib; `src` is passed through the VEX vvvv operand.
    void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) {
        int prefix = 0x66,
            map    = 0x0f,
            opcode = 0xc4;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, src, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
        this->byte(imm);   // lane index
    }
2000 
    // vpinsrb: dst = src with 8-bit lane `imm` replaced by the byte loaded from [ptr].
    // Encodes VEX.66.0F3A 20 /r ib; `src` is passed through the VEX vvvv operand.
    void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x20;
        VEX v = vex(0, dst>>3, 0, ptr>>3,
                    map, src, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
        this->byte(imm);   // lane index
    }
2012 
    // vpextrw: store 16-bit lane `imm` of xmm `src` to memory at [ptr].
    // Encodes VEX.66.0F3A 15 /r ib (the memory-destination form of PEXTRW).
    void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x15;

        VEX v = vex(0, src>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
        this->byte(imm);   // lane index
    }
    // vpextrb: store 8-bit lane `imm` of xmm `src` to memory at [ptr].
    // Encodes VEX.66.0F3A 14 /r ib.
    void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) {
        int prefix = 0x66,
            map    = 0x3a0f,
            opcode = 0x14;

        VEX v = vex(0, src>>3, 0, ptr>>3,
                    map, 0, /*ymm?*/0, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
        this->byte(imm);   // lane index
    }
2037 
    // vgatherdps: gather eight 32-bit floats, dst[i] = *(base + (ix[i] << scale)),
    // for lanes enabled by `mask`.  Encodes VEX.66.0F38 92 /r (AVX2 VGATHERDPS);
    // the mask register travels in the VEX vvvv operand.
    void Assembler::vgatherdps(Ymm dst, Scale scale, Ymm ix, GP64 base, Ymm mask) {
        // Unlike most instructions, no aliasing is permitted here.
        SkASSERT(dst != ix);
        SkASSERT(dst != mask);
        SkASSERT(mask != ix);

        int prefix = 0x66,
            map    = 0x380f,
            opcode = 0x92;
        VEX v = vex(0, dst>>3, ix>>3, base>>3,
                    map, mask, /*ymm?*/1, prefix);
        this->bytes(v.bytes, v.len);
        this->byte(opcode);
        // rm = rsp escapes to the SIB byte, which carries scale/index/base.
        this->byte(mod_rm(Mod::Indirect, dst&7, rsp));
        this->byte(sib(scale, ix&7, base&7));
    }
2054 
2055     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
2056 
operator ""_mask(unsigned long long bits)2057     static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
2058 
    // Pack and emit one 32-bit AArch64 three-register SIMD instruction:
    // an 11-bit high opcode field, register m, a 6-bit low opcode field, then n and d.
    void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
        this->word( (hi & 11_mask) << 21
                  | (m  &  5_mask) << 16
                  | (lo &  6_mask) << 10
                  | (n  &  5_mask) <<  5
                  | (d  &  5_mask) <<  0);
    }
2066 
    // 128-bit (16 byte-lane) bitwise ops.  bic = and-not, bsl = bitwise select (d is the mask).
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::not16b(V d, V n)      { this->op(0b0'1'1'01110'00'10000'00101'10,  n, d); }
2073 
    // Integer arithmetic: 4s = four 32-bit lanes, 8h = eight 16-bit lanes.
    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    // Integer compares, writing all-ones/all-zeros per lane.
    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }
2083 
    // Float arithmetic on four 32-bit lanes.
    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }
    void Assembler::fmin4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11110'1, n, d); }
    void Assembler::fmax4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11110'1, n, d); }
    void Assembler::fneg4s(V d, V n)      { this->op(0b0'1'1'01110'1'0'10000'01111'10,  n, d); }

    // Float compares, writing all-ones/all-zeros per lane.
    void Assembler::fcmeq4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmgt4s(V d, V n, V m) { this->op(0b0'1'1'01110'1'0'1, m, 0b1110'0'1, n, d); }
    void Assembler::fcmge4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b1110'0'1, n, d); }

    // Fused multiply-add/subtract accumulating into d: d += n*m (fmla), d -= n*m (fmls).
    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }
    void Assembler::fmls4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11001'1, n, d); }
2098 
    // TBL: byte-wise table lookup, d[i] = n[m[i]] (out-of-range indices produce zero).
    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
2100 
    // Pack and emit one 32-bit AArch64 two-register instruction with an embedded immediate.
    // The 22-bit opcode occupies bits [31:10]; `imm` is OR'd in starting at bit 16, so its
    // effective width (and any overlap with op22) depends on the particular opcode.
    void Assembler::op(uint32_t op22, int imm, V n, V d) {
        this->word( (op22 & 22_mask) << 10
                  | imm              << 16   // imm is embedded inside op, bit size depends on op
                  | (n    &  5_mask) <<  5
                  | (d    &  5_mask) <<  0);
    }
2107 
    // Immediate shifts.  sli = shift-left-and-insert (keeps bits shifted past the low end).
    void Assembler::sli4s(V d, V n, int imm) {
        this->op(0b0'1'1'011110'0100'000'01010'1,    ( imm&31), n, d);
    }
    void Assembler::shl4s(V d, V n, int imm) {
        this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
    }
    // Right shifts encode the amount as (lane width - shift) in immh:immb,
    // which is why these pass (-imm & 31) / (-imm & 15) rather than imm directly.
    void Assembler::sshr4s(V d, V n, int imm) {
        this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
    }
    void Assembler::ushr4s(V d, V n, int imm) {
        this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
    }
    void Assembler::ushr8h(V d, V n, int imm) {
        this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
    }
2123 
    // Int <-> float conversions: scvtf = signed int to float,
    // fcvtzs = float to int truncating toward zero, fcvtns = float to int round-to-nearest.
    void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
    void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
    void Assembler::fcvtns4s(V d, V n) { this->op(0b0'1'0'01110'0'0'10000'1101'0'10, n,d); }

    // Lane-width narrowing (xtn: take the low half of each lane)...
    void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
    void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }

    // ...and zero-extending widening (uxtl).
    void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
    void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }

    // Unsigned minimum across the four 32-bit lanes, reduced into d.
    void Assembler::uminv4s(V d, V n) { this->op(0b0'1'1'01110'10'11000'1'1010'10, n,d); }
2135 
    // BRK #imm16: software breakpoint, handy for debugging generated code.
    void Assembler::brk(int imm16) {
        this->word(0b11010100'001'0000000000000000'000'00
                  | (imm16 & 16_mask) << 5);
    }
2140 
    // RET: return via the address held in register n (conventionally x30, the link register).
    void Assembler::ret(X n) {
        this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
                  | (n & 5_mask) << 5);
    }
2145 
    // 64-bit add/subtract with a 12-bit unsigned immediate (leading sf=1 bit selects 64-bit).
    // subs additionally sets the condition flags, for use with b.cond below.
    void Assembler::add(X d, X n, int imm12) {
        this->word(0b1'0'0'10001'00   << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
    void Assembler::sub(X d, X n, int imm12) {
        this->word( 0b1'1'0'10001'00  << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
    void Assembler::subs(X d, X n, int imm12) {
        this->word( 0b1'1'1'10001'00  << 22
                  | (imm12 & 12_mask) << 10
                  | (n     &  5_mask) <<  5
                  | (d     &  5_mask) <<  0);
    }
2164 
    // PC-relative branches.  disp19() records the label reference for later patching
    // by label() and returns the current 19-bit displacement (in instructions).
    void Assembler::b(Condition cond, Label* l) {              // B.cond
        const int imm19 = this->disp19(l);
        this->word( 0b0101010'0           << 24
                  | (imm19     & 19_mask) <<  5
                  | ((int)cond &  4_mask) <<  0);
    }
    void Assembler::cbz(X t, Label* l) {                       // branch if t == 0
        const int imm19 = this->disp19(l);
        this->word( 0b1'011010'0      << 24
                  | (imm19 & 19_mask) <<  5
                  | (t     &  5_mask) <<  0);
    }
    void Assembler::cbnz(X t, Label* l) {                      // branch if t != 0
        const int imm19 = this->disp19(l);
        this->word( 0b1'011010'1      << 24
                  | (imm19 & 19_mask) <<  5
                  | (t     &  5_mask) <<  0);
    }
2183 
    // SIMD loads/stores at [reg] with zero offset: q = 128-bit, s = 32-bit, b = 8-bit.
    void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
    void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
    void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }

    void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
    void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
    void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
2191 
    // FMOV (general): copy the low 32 bits of SIMD register `src` into general register
    // `dst` (sf=0, so this is the 32-bit W view of dst).
    void Assembler::fmovs(X dst, V src) {
        this->word(0b0'0'0'11110'00'1'00'110'000000 << 10
                  | (src & 5_mask)                  << 5
                  | (dst & 5_mask)                  << 0);
    }
2197 
    // LDR (literal): PC-relative load of a 128-bit constant at `l` into SIMD register `dst`.
    // The 19-bit displacement is recorded via disp19() and patched by label().
    void Assembler::ldrq(V dst, Label* l) {
        const int imm19 = this->disp19(l);
        this->word( 0b10'011'1'00     << 24
                  | (imm19 & 19_mask) << 5
                  | (dst   &  5_mask) << 0);
    }
2204 
    // Bind label `l` to the current position, retroactively patching every instruction
    // that referenced it.  On the size-measuring first pass fCode is null, so patching
    // is skipped and only offsets are tracked.
    void Assembler::label(Label* l) {
        if (fCode) {
            // The instructions all currently point to l->offset.
            // We'll want to add a delta to point them to here().
            int delta = here().offset - l->offset;
            l->offset = here().offset;

            if (l->kind == Label::ARMDisp19) {
                for (int ref : l->references) {
                    // ref points to a 32-bit instruction with 19-bit displacement in instructions.
                    uint32_t inst;
                    memcpy(&inst, fCode + ref, 4);

                    // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
                    // Shift left to drop the top 8 bits, then arithmetic-shift right to
                    // sign-extend the 19-bit field down into an int.
                    int disp = (int)(inst << 8) >> 13;

                    disp += delta/4;  // delta is in bytes, we want instructions.

                    // Put it all back together, preserving the high 8 bits and low 5.
                    inst = ((disp << 5) &  (19_mask << 5))
                         | ((inst     ) & ~(19_mask << 5));

                    memcpy(fCode + ref, &inst, 4);
                }
            }

            if (l->kind == Label::X86Disp32) {
                for (int ref : l->references) {
                    // ref points to a 32-bit displacement in bytes.
                    int disp;
                    memcpy(&disp, fCode + ref, 4);

                    disp += delta;

                    memcpy(fCode + ref, &disp, 4);
                }
            }
        }
    }
2244 
    // Run the program over n lanes with the given argument pointers, preferring the
    // JIT-compiled entry point when it's ready and falling back to the interpreter.
    void Program::eval(int n, void* args[]) const {
    #define SKVM_JIT_STATS 0
    #if SKVM_JIT_STATS
        // Opt-in counters reporting at exit how much work went through the JIT.
        static std::atomic<int64_t>  calls{0}, jits{0},
                                    pixels{0}, fast{0};
        pixels += n;
        if (0 == calls++) {
            atexit([]{
                int64_t num = jits .load(),
                        den = calls.load();
                SkDebugf("%.3g%% of %lld eval() calls went through JIT.\n", (100.0 * num)/den, den);
                num = fast  .load();
                den = pixels.load();
                SkDebugf("%.3g%% of %lld pixels went through JIT.\n", (100.0 * num)/den, den);
            });
        }
    #endif
        // This may fail either simply because we can't JIT, or when using LLVM,
        // because the work represented by fImpl->llvm_compiling hasn't finished yet.
        if (const void* b = fImpl->jit_entry.load()) {
    #if SKVM_JIT_STATS
            jits++;
            fast += n;
    #endif
            // The JIT entry takes (n, one pointer per argument); cast to the
            // matching signature for however many arguments this program has.
            void** a = args;
            switch (fImpl->strides.size()) {
                case 0: return ((void(*)(int                        ))b)(n                    );
                case 1: return ((void(*)(int,void*                  ))b)(n,a[0]               );
                case 2: return ((void(*)(int,void*,void*            ))b)(n,a[0],a[1]          );
                case 3: return ((void(*)(int,void*,void*,void*      ))b)(n,a[0],a[1],a[2]     );
                case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]);
                case 5: return ((void(*)(int,void*,void*,void*,void*,void*))b)
                                (n,a[0],a[1],a[2],a[3],a[4]);
                default: SkUNREACHABLE;  // TODO
            }
        }

        // So we'll sometimes use the interpreter here even if later calls will use the JIT.
        SkOpts::interpret_skvm(fImpl->instructions.data(), (int)fImpl->instructions.size(),
                               this->nregs(), this->loop(), fImpl->strides.data(), this->nargs(),
                               n, args);
    }
2287 
2288 #if defined(SKVM_LLVM)
setupLLVM(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)2289     void Program::setupLLVM(const std::vector<OptimizedInstruction>& instructions,
2290                             const char* debug_name) {
2291         auto ctx = std::make_unique<llvm::LLVMContext>();
2292 
2293         auto mod = std::make_unique<llvm::Module>("", *ctx);
2294         // All the scary bare pointers from here on are owned by ctx or mod, I think.
2295 
2296         // Everything I've tested runs faster at K=8 (using ymm) than K=16 (zmm) on SKX machines.
2297         const int K = (true && SkCpu::Supports(SkCpu::HSW)) ? 8 : 4;
2298 
2299         llvm::Type *ptr = llvm::Type::getInt8Ty(*ctx)->getPointerTo(),
2300                    *i32 = llvm::Type::getInt32Ty(*ctx);
2301 
2302         std::vector<llvm::Type*> arg_types = { i32 };
2303         for (size_t i = 0; i < fImpl->strides.size(); i++) {
2304             arg_types.push_back(ptr);
2305         }
2306 
2307         llvm::FunctionType* fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*ctx),
2308                                                               arg_types, /*vararg?=*/false);
2309         llvm::Function* fn
2310             = llvm::Function::Create(fn_type, llvm::GlobalValue::ExternalLinkage, debug_name, *mod);
2311         for (size_t i = 0; i < fImpl->strides.size(); i++) {
2312             fn->addParamAttr(i+1, llvm::Attribute::NoAlias);
2313         }
2314 
2315         llvm::BasicBlock *enter  = llvm::BasicBlock::Create(*ctx, "enter" , fn),
2316                          *hoistK = llvm::BasicBlock::Create(*ctx, "hoistK", fn),
2317                          *testK  = llvm::BasicBlock::Create(*ctx, "testK" , fn),
2318                          *loopK  = llvm::BasicBlock::Create(*ctx, "loopK" , fn),
2319                          *hoist1 = llvm::BasicBlock::Create(*ctx, "hoist1", fn),
2320                          *test1  = llvm::BasicBlock::Create(*ctx, "test1" , fn),
2321                          *loop1  = llvm::BasicBlock::Create(*ctx, "loop1" , fn),
2322                          *leave  = llvm::BasicBlock::Create(*ctx, "leave" , fn);
2323 
2324         using IRBuilder = llvm::IRBuilder<>;
2325 
2326         llvm::PHINode*                 n;
2327         std::vector<llvm::PHINode*> args;
2328         std::vector<llvm::Value*> vals(instructions.size());
2329 
2330         auto emit = [&](size_t i, bool scalar, IRBuilder* b) {
2331             auto [op, x,y,z, immy,immz, death,can_hoist,used_in_loop] = instructions[i];
2332 
2333             llvm::Type *i1    = llvm::Type::getInt1Ty (*ctx),
2334                        *i8    = llvm::Type::getInt8Ty (*ctx),
2335                        *i8x4  = llvm::VectorType::get(i8, 4),
2336                        *i16   = llvm::Type::getInt16Ty(*ctx),
2337                        *i16x2 = llvm::VectorType::get(i16, 2),
2338                        *f32   = llvm::Type::getFloatTy(*ctx),
2339                        *I1    = scalar ? i1    : llvm::VectorType::get(i1 , K  ),
2340                        *I8    = scalar ? i8    : llvm::VectorType::get(i8 , K  ),
2341                        *I8x4  = scalar ? i8x4  : llvm::VectorType::get(i8 , K*4),
2342                        *I16   = scalar ? i16   : llvm::VectorType::get(i16, K  ),
2343                        *I16x2 = scalar ? i16x2 : llvm::VectorType::get(i16, K*2),
2344                        *I32   = scalar ? i32   : llvm::VectorType::get(i32, K  ),
2345                        *F32   = scalar ? f32   : llvm::VectorType::get(f32, K  );
2346 
2347             auto I  = [&](llvm::Value* v) { return b->CreateBitCast(v, I32  ); };
2348             auto F  = [&](llvm::Value* v) { return b->CreateBitCast(v, F32  ); };
2349             auto x2 = [&](llvm::Value* v) { return b->CreateBitCast(v, I16x2); };
2350 
2351             auto S = [&](llvm::Type* dst, llvm::Value* v) { return b->CreateSExt(v, dst); };
2352 
2353             switch (llvm::Type* t = nullptr; op) {
2354                 default:
2355                     SkDebugf("can't llvm %s (%d)\n", name(op), op);
2356                     return false;
2357 
2358                 case Op::assert_true: /*TODO*/ break;
2359 
2360                 case Op::index:
2361                     if (I32->isVectorTy()) {
2362                         std::vector<llvm::Constant*> iota(K);
2363                         for (int j = 0; j < K; j++) {
2364                             iota[j] = b->getInt32(j);
2365                         }
2366                         vals[i] = b->CreateSub(b->CreateVectorSplat(K, n),
2367                                                llvm::ConstantVector::get(iota));
2368                     } else {
2369                         vals[i] = n;
2370                     } break;
2371 
2372                 case Op::load8:  t = I8 ; goto load;
2373                 case Op::load16: t = I16; goto load;
2374                 case Op::load32: t = I32; goto load;
2375                 load: {
2376                     llvm::Value* ptr = b->CreateBitCast(args[immy], t->getPointerTo());
2377                     vals[i] = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), I32);
2378                 } break;
2379 
2380 
2381                 case Op::splat: vals[i] = llvm::ConstantInt::get(I32, immy); break;
2382 
2383                 case Op::uniform8:  t = i8 ; goto uniform;
2384                 case Op::uniform16: t = i16; goto uniform;
2385                 case Op::uniform32: t = i32; goto uniform;
2386                 uniform: {
2387                     llvm::Value* ptr = b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2388                                                                                       args[immy],
2389                                                                                       immz),
2390                                                         t->getPointerTo());
2391                     llvm::Value* val = b->CreateZExt(b->CreateAlignedLoad(ptr, 1), i32);
2392                     vals[i] = I32->isVectorTy() ? b->CreateVectorSplat(K, val)
2393                                                 : val;
2394                 } break;
2395 
2396                 case Op::gather8:  t = i8 ; goto gather;
2397                 case Op::gather16: t = i16; goto gather;
2398                 case Op::gather32: t = i32; goto gather;
2399                 gather: {
2400                     // Our gather base pointer is immz bytes off of uniform immy.
2401                     llvm::Value* base =
2402                         b->CreateLoad(b->CreateBitCast(b->CreateConstInBoundsGEP1_32(nullptr,
2403                                                                                      args[immy],
2404                                                                                      immz),
2405                                                        t->getPointerTo()->getPointerTo()));
2406 
2407                     llvm::Value* ptr = b->CreateInBoundsGEP(nullptr, base, vals[x]);
2408                     llvm::Value* gathered;
2409                     if (ptr->getType()->isVectorTy()) {
2410                         gathered = b->CreateMaskedGather(ptr, 1);
2411                     } else {
2412                         gathered = b->CreateAlignedLoad(ptr, 1);
2413                     }
2414                     vals[i] = b->CreateZExt(gathered, I32);
2415                 } break;
2416 
2417                 case Op::store8:  t = I8 ; goto store;
2418                 case Op::store16: t = I16; goto store;
2419                 case Op::store32: t = I32; goto store;
2420                 store: {
2421                     llvm::Value* val = b->CreateTrunc(vals[x], t);
2422                     llvm::Value* ptr = b->CreateBitCast(args[immy],
2423                                                         val->getType()->getPointerTo());
2424                     vals[i] = b->CreateAlignedStore(val, ptr, 1);
2425                 } break;
2426 
2427                 case Op::bit_and:   vals[i] = b->CreateAnd(vals[x], vals[y]); break;
2428                 case Op::bit_or :   vals[i] = b->CreateOr (vals[x], vals[y]); break;
2429                 case Op::bit_xor:   vals[i] = b->CreateXor(vals[x], vals[y]); break;
2430                 case Op::bit_clear: vals[i] = b->CreateAnd(vals[x], b->CreateNot(vals[y])); break;
2431 
2432                 case Op::pack: vals[i] = b->CreateOr(vals[x], b->CreateShl(vals[y], immz)); break;
2433 
2434                 case Op::select:
2435                     vals[i] = b->CreateSelect(b->CreateTrunc(vals[x], I1), vals[y], vals[z]);
2436                     break;
2437 
2438                 case Op::add_i32: vals[i] = b->CreateAdd(vals[x], vals[y]); break;
2439                 case Op::sub_i32: vals[i] = b->CreateSub(vals[x], vals[y]); break;
2440                 case Op::mul_i32: vals[i] = b->CreateMul(vals[x], vals[y]); break;
2441 
2442                 case Op::shl_i32: vals[i] = b->CreateShl (vals[x], immy); break;
2443                 case Op::sra_i32: vals[i] = b->CreateAShr(vals[x], immy); break;
2444                 case Op::shr_i32: vals[i] = b->CreateLShr(vals[x], immy); break;
2445 
2446                 case Op:: eq_i32: vals[i] = S(I32, b->CreateICmpEQ (vals[x], vals[y])); break;
2447                 case Op::neq_i32: vals[i] = S(I32, b->CreateICmpNE (vals[x], vals[y])); break;
2448                 case Op:: gt_i32: vals[i] = S(I32, b->CreateICmpSGT(vals[x], vals[y])); break;
2449                 case Op::gte_i32: vals[i] = S(I32, b->CreateICmpSGE(vals[x], vals[y])); break;
2450 
2451                 case Op::add_f32: vals[i] = I(b->CreateFAdd(F(vals[x]), F(vals[y]))); break;
2452                 case Op::sub_f32: vals[i] = I(b->CreateFSub(F(vals[x]), F(vals[y]))); break;
2453                 case Op::mul_f32: vals[i] = I(b->CreateFMul(F(vals[x]), F(vals[y]))); break;
2454                 case Op::div_f32: vals[i] = I(b->CreateFDiv(F(vals[x]), F(vals[y]))); break;
2455 
2456                 case Op:: eq_f32: vals[i] = S(I32, b->CreateFCmpOEQ(F(vals[x]), F(vals[y]))); break;
2457                 case Op::neq_f32: vals[i] = S(I32, b->CreateFCmpUNE(F(vals[x]), F(vals[y]))); break;
2458                 case Op:: gt_f32: vals[i] = S(I32, b->CreateFCmpOGT(F(vals[x]), F(vals[y]))); break;
2459                 case Op::gte_f32: vals[i] = S(I32, b->CreateFCmpOGE(F(vals[x]), F(vals[y]))); break;
2460 
2461                 case Op::fma_f32:
2462                     vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2463                                                    {F(vals[x]), F(vals[y]), F(vals[z])}));
2464                     break;
2465 
2466                 case Op::fms_f32:
2467                     vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2468                                                    {F(vals[x]), F(vals[y]),
2469                                                     b->CreateFNeg(F(vals[z]))}));
2470                     break;
2471 
2472                 case Op::fnma_f32:
2473                     vals[i] = I(b->CreateIntrinsic(llvm::Intrinsic::fma, {F32},
2474                                                    {b->CreateFNeg(F(vals[x])), F(vals[y]),
2475                                                     F(vals[z])}));
2476                     break;
2477 
2478                 case Op::floor:
2479                     vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::floor, F(vals[x])));
2480                     break;
2481 
2482                 case Op::max_f32:
2483                     vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[x]), F(vals[y])),
2484                                                 F(vals[y]), F(vals[x])));
2485                     break;
2486                 case Op::min_f32:
2487                     vals[i] = I(b->CreateSelect(b->CreateFCmpOLT(F(vals[y]), F(vals[x])),
2488                                                 F(vals[y]), F(vals[x])));
2489                     break;
2490 
2491                 case Op::sqrt_f32:
2492                     vals[i] = I(b->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, F(vals[x])));
2493                     break;
2494 
2495                 case Op::to_f32: vals[i] = I(b->CreateSIToFP(  vals[x] , F32)); break;
2496                 case Op::trunc : vals[i] =   b->CreateFPToSI(F(vals[x]), I32) ; break;
2497                 case Op::round : {
2498                     // Basic impl when we can't use cvtps2dq and co.
2499                     auto round = b->CreateUnaryIntrinsic(llvm::Intrinsic::rint, F(vals[x]));
2500                     vals[i] = b->CreateFPToSI(round, I32);
2501 
2502                 #if 1 && defined(SK_CPU_X86)
2503                     // Using b->CreateIntrinsic(..., {}, {...}) to avoid name mangling.
2504                     if (scalar) {
2505                         // cvtss2si is float x4 -> int, ignoring input lanes 1,2,3.  ¯\_(ツ)_/¯
2506                         llvm::Value* v = llvm::UndefValue::get(llvm::VectorType::get(f32, 4));
2507                         v = b->CreateInsertElement(v, F(vals[x]), (uint64_t)0);
2508                         vals[i] = b->CreateIntrinsic(llvm::Intrinsic::x86_sse_cvtss2si, {}, {v});
2509                     } else {
2510                         SkASSERT(K == 4  || K == 8);
2511                         auto intr = K == 4 ?   llvm::Intrinsic::x86_sse2_cvtps2dq :
2512                                  /* K == 8 ?*/ llvm::Intrinsic::x86_avx_cvt_ps2dq_256;
2513                         vals[i] = b->CreateIntrinsic(intr, {}, {F(vals[x])});
2514                     }
2515                 #endif
2516                 } break;
2517 
2518                 case Op::add_i16x2: vals[i] = I(b->CreateAdd(x2(vals[x]), x2(vals[y]))); break;
2519                 case Op::sub_i16x2: vals[i] = I(b->CreateSub(x2(vals[x]), x2(vals[y]))); break;
2520                 case Op::mul_i16x2: vals[i] = I(b->CreateMul(x2(vals[x]), x2(vals[y]))); break;
2521 
2522                 case Op::shl_i16x2: vals[i] = I(b->CreateShl (x2(vals[x]), immy)); break;
2523                 case Op::sra_i16x2: vals[i] = I(b->CreateAShr(x2(vals[x]), immy)); break;
2524                 case Op::shr_i16x2: vals[i] = I(b->CreateLShr(x2(vals[x]), immy)); break;
2525 
2526                 case Op:: eq_i16x2:
2527                     vals[i] = I(S(I16x2, b->CreateICmpEQ (x2(vals[x]), x2(vals[y]))));
2528                     break;
2529                 case Op::neq_i16x2:
2530                     vals[i] = I(S(I16x2, b->CreateICmpNE (x2(vals[x]), x2(vals[y]))));
2531                     break;
2532                 case Op:: gt_i16x2:
2533                     vals[i] = I(S(I16x2, b->CreateICmpSGT(x2(vals[x]), x2(vals[y]))));
2534                     break;
2535                 case Op::gte_i16x2:
2536                     vals[i] = I(S(I16x2, b->CreateICmpSGE(x2(vals[x]), x2(vals[y]))));
2537                     break;
2538 
2539                 case Op::bytes: {
2540                     int N = vals[x]->getType()->isVectorTy() ? K : 1;
2541 
2542                     uint32_t off = 0;
2543                     auto nibble_to_mask = [&](uint8_t n) -> uint32_t {
2544                         switch (n) {
2545                             case 0: return       4*N;   // Select any byte in the second (zero) arg.
2546                             case 1: return off +   0;   // 1st byte in this arg.
2547                             case 2: return off +   1;   // 2nd ...
2548                             case 3: return off +   2;   // 3rd ...
2549                             case 4: return off +   3;   // 4th byte in this arg.
2550                         }
2551                         SkUNREACHABLE;
2552                         return 0;
2553                     };
2554 
2555                     std::vector<uint32_t> mask(N*4);
2556                     for (int i = 0; i < N; i++) {
2557                         mask[4*i+0] = nibble_to_mask( (immy >>  0) & 0xf );
2558                         mask[4*i+1] = nibble_to_mask( (immy >>  4) & 0xf );
2559                         mask[4*i+2] = nibble_to_mask( (immy >>  8) & 0xf );
2560                         mask[4*i+3] = nibble_to_mask( (immy >> 12) & 0xf );
2561                         off += 4;
2562                     }
2563 
2564                     llvm::Value* input =  b->CreateBitCast(vals[x], I8x4);
2565                     llvm::Value* zero  = llvm::Constant::getNullValue(I8x4);
2566                     vals[i] = I(b->CreateShuffleVector(input, zero, mask));
2567                 } break;
2568             }
2569             return true;
2570         };
2571 
2572         {
2573             IRBuilder b(enter);
2574             b.CreateBr(hoistK);
2575         }
2576 
2577         // hoistK: emit each hoistable vector instruction; goto testK;
2578         // LLVM can do this sort of thing itself, but we've got the information cheap,
2579         // and pointer aliasing makes it easier to manually hoist than teach LLVM it's safe.
2580         {
2581             IRBuilder b(hoistK);
2582 
2583             // Hoisted instructions will need args (think, uniforms), so set that up now.
2584             // These phi nodes are degenerate... they'll always be the passed-in args from enter.
2585             // Later on when we start looping the phi nodes will start looking useful.
2586             llvm::Argument* arg = fn->arg_begin();
2587             (void)arg++;  // Leave n as nullptr... it'd be a bug to use n in a hoisted instruction.
2588             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2589                 args.push_back(b.CreatePHI(arg->getType(), 1));
2590                 args.back()->addIncoming(arg++, enter);
2591             }
2592 
2593             for (size_t i = 0; i < instructions.size(); i++) {
2594                 if (instructions[i].can_hoist && !emit(i, false, &b)) {
2595                     return;
2596                 }
2597             }
2598 
2599             b.CreateBr(testK);
2600         }
2601 
2602         // testK:  if (N >= K) goto loopK; else goto hoist1;
2603         {
2604             IRBuilder b(testK);
2605 
2606             // New phi nodes for `n` and each pointer argument from hoistK; later we'll add loopK.
2607             // These also start as the initial function arguments; hoistK can't have changed them.
2608             llvm::Argument* arg = fn->arg_begin();
2609 
2610             n = b.CreatePHI(arg->getType(), 2);
2611             n->addIncoming(arg++, hoistK);
2612 
2613             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2614                 args[i] = b.CreatePHI(arg->getType(), 2);
2615                 args[i]->addIncoming(arg++, hoistK);
2616             }
2617 
2618             b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(K)), loopK, hoist1);
2619         }
2620 
2621         // loopK:  ... insts on K x T vectors; N -= K, args += K*stride; goto testK;
2622         {
2623             IRBuilder b(loopK);
2624             for (size_t i = 0; i < instructions.size(); i++) {
2625                 if (!instructions[i].can_hoist && !emit(i, false, &b)) {
2626                     return;
2627                 }
2628             }
2629 
2630             // n -= K
2631             llvm::Value* n_next = b.CreateSub(n, b.getInt32(K));
2632             n->addIncoming(n_next, loopK);
2633 
2634             // Each arg ptr += K
2635             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2636                 llvm::Value* arg_next
2637                     = b.CreateConstInBoundsGEP1_32(nullptr, args[i], K*fImpl->strides[i]);
2638                 args[i]->addIncoming(arg_next, loopK);
2639             }
2640             b.CreateBr(testK);
2641         }
2642 
2643         // hoist1: emit each hoistable scalar instruction; goto test1;
2644         {
2645             IRBuilder b(hoist1);
2646             for (size_t i = 0; i < instructions.size(); i++) {
2647                 if (instructions[i].can_hoist && !emit(i, true, &b)) {
2648                     return;
2649                 }
2650             }
2651             b.CreateBr(test1);
2652         }
2653 
2654         // test1:  if (N >= 1) goto loop1; else goto leave;
2655         {
2656             IRBuilder b(test1);
2657 
2658             // Set up new phi nodes for `n` and each pointer argument, now from hoist1 and loop1.
2659             llvm::PHINode* n_new = b.CreatePHI(n->getType(), 2);
2660             n_new->addIncoming(n, hoist1);
2661             n = n_new;
2662 
2663             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2664                 llvm::PHINode* arg_new = b.CreatePHI(args[i]->getType(), 2);
2665                 arg_new->addIncoming(args[i], hoist1);
2666                 args[i] = arg_new;
2667             }
2668 
2669             b.CreateCondBr(b.CreateICmpSGE(n, b.getInt32(1)), loop1, leave);
2670         }
2671 
2672         // loop1:  ... insts on scalars; N -= 1, args += stride; goto test1;
2673         {
2674             IRBuilder b(loop1);
2675             for (size_t i = 0; i < instructions.size(); i++) {
2676                 if (!instructions[i].can_hoist && !emit(i, true, &b)) {
2677                     return;
2678                 }
2679             }
2680 
2681             // n -= 1
2682             llvm::Value* n_next = b.CreateSub(n, b.getInt32(1));
2683             n->addIncoming(n_next, loop1);
2684 
            // Each arg ptr += stride (one lane at a time in this scalar loop).
2686             for (size_t i = 0; i < fImpl->strides.size(); i++) {
2687                 llvm::Value* arg_next
2688                     = b.CreateConstInBoundsGEP1_32(nullptr, args[i], fImpl->strides[i]);
2689                 args[i]->addIncoming(arg_next, loop1);
2690             }
2691             b.CreateBr(test1);
2692         }
2693 
2694         // leave:  ret
2695         {
2696             IRBuilder b(leave);
2697             b.CreateRetVoid();
2698         }
2699 
2700         SkASSERT(false == llvm::verifyModule(*mod, &llvm::outs()));
2701 
2702         if (true) {
2703             SkString path = SkStringPrintf("/tmp/%s.bc", debug_name);
2704             std::error_code err;
2705             llvm::raw_fd_ostream os(path.c_str(), err);
2706             if (err) {
2707                 return;
2708             }
2709             llvm::WriteBitcodeToFile(*mod, os);
2710         }
2711 
2712         static SkOnce once;
2713         once([]{
2714             SkAssertResult(false == llvm::InitializeNativeTarget());
2715             SkAssertResult(false == llvm::InitializeNativeTargetAsmPrinter());
2716         });
2717 
2718         if (llvm::ExecutionEngine* ee = llvm::EngineBuilder(std::move(mod))
2719                                             .setEngineKind(llvm::EngineKind::JIT)
2720                                             .setMCPU(llvm::sys::getHostCPUName())
2721                                             .create()) {
2722             fImpl->llvm_ctx = std::move(ctx);
2723             fImpl->llvm_ee.reset(ee);
2724 
2725             // We have to be careful here about what we close over and how, in case fImpl moves.
2726             // fImpl itself may change, but its pointee fields won't, so close over them by value.
2727             // Also, debug_name will almost certainly leave scope, so copy it.
2728             fImpl->llvm_compiling = std::async(std::launch::async, [dst  = &fImpl->jit_entry,
2729                                                                     ee   =  fImpl->llvm_ee.get(),
2730                                                                     name = std::string(debug_name)]{
2731                 // std::atomic<void*>*    dst;
2732                 // llvm::ExecutionEngine* ee;
2733                 // std::string            name;
2734                 dst->store( (void*)ee->getFunctionAddress(name.c_str()) );
2735             });
2736         }
2737     }
2738 #endif
2739 
waitForLLVM() const2740     void Program::waitForLLVM() const {
2741     #if defined(SKVM_LLVM)
2742         if (fImpl->llvm_compiling.valid()) {
2743             fImpl->llvm_compiling.wait();
2744         }
2745     #endif
2746     }
2747 
hasJIT() const2748     bool Program::hasJIT() const {
2749         // Program::hasJIT() is really just a debugging / test aid,
2750         // so we don't mind adding a sync point here to wait for compilation.
2751         this->waitForLLVM();
2752 
2753         return fImpl->jit_entry.load() != nullptr;
2754     }
2755 
    // Release whatever JIT artifacts this Program owns and return it to
    // interpreter-only execution.  Safe to call when nothing was ever JIT'd.
    void Program::dropJIT() {
    #if defined(SKVM_LLVM)
        // The ExecutionEngine must not be torn down while the background
        // compile task is still using it.
        this->waitForLLVM();
        fImpl->llvm_ee .reset(nullptr);
        fImpl->llvm_ctx.reset(nullptr);
    #elif defined(SKVM_JIT)
        if (fImpl->dylib) {
            // JIT code was loaded through a dylib; dlclose() unloads it.
            dlclose(fImpl->dylib);
        } else if (auto jit_entry = fImpl->jit_entry.load()) {
            // Otherwise the code pages were mmap'd directly; unmap them.
            munmap(jit_entry, fImpl->jit_size);
        }
    #else
        // No JIT compiled in, so there had better be nothing to drop.
        SkASSERT(!this->hasJIT());
    #endif

        // In every configuration, leave the Program in a clean interpreter-only state.
        fImpl->jit_entry.store(nullptr);
        fImpl->jit_size  = 0;
        fImpl->dylib     = nullptr;
    }
2775 
Program()2776     Program::Program() : fImpl(std::make_unique<Impl>()) {}
2777 
~Program()2778     Program::~Program() {
2779         // Moved-from Programs may have fImpl == nullptr.
2780         if (fImpl) {
2781             this->dropJIT();
2782         }
2783     }
2784 
Program(Program && other)2785     Program::Program(Program&& other) : fImpl(std::move(other.fImpl)) {}
2786 
operator =(Program && other)2787     Program& Program::operator=(Program&& other) {
2788         fImpl = std::move(other.fImpl);
2789         return *this;
2790     }
2791 
Program(const std::vector<OptimizedInstruction> & interpreter,const std::vector<int> & strides)2792     Program::Program(const std::vector<OptimizedInstruction>& interpreter,
2793                      const std::vector<int>& strides) : Program() {
2794         fImpl->strides = strides;
2795         this->setupInterpreter(interpreter);
2796     }
2797 
    // Build a Program that prefers a JIT: LLVM if available, otherwise our own
    // JIT, and in all cases the interpreter as a fallback.
    Program::Program(const std::vector<OptimizedInstruction>& interpreter,
                     const std::vector<OptimizedInstruction>& jit,
                     const std::vector<int>& strides,
                     const char* debug_name) : Program() {
        fImpl->strides = strides;
    #if 1 && defined(SKVM_LLVM)
        this->setupLLVM(interpreter, debug_name);
    #elif 1 && defined(SKVM_JIT)
        this->setupJIT(jit, debug_name);
    #endif

        // Might as well do this after setupLLVM() to get a little more time to compile.
        this->setupInterpreter(interpreter);
    }
2812 
instructions() const2813     std::vector<InterpreterInstruction> Program::instructions() const { return fImpl->instructions; }
nargs() const2814     int  Program::nargs() const { return (int)fImpl->strides.size(); }
nregs() const2815     int  Program::nregs() const { return fImpl->regs; }
loop() const2816     int  Program::loop () const { return fImpl->loop; }
empty() const2817     bool Program::empty() const { return fImpl->instructions.empty(); }
2818 
    // Translate OptimizedInstructions to InterpreterInstructions.
    void Program::setupInterpreter(const std::vector<OptimizedInstruction>& instructions) {
        // Register each instruction is assigned to.
        std::vector<Reg> reg(instructions.size());

        // This next bit is a bit more complicated than strictly necessary;
        // we could just assign every instruction to its own register.
        //
        // But recycling registers is fairly cheap, and good practice for the
        // JITs where minimizing register pressure really is important.
        //
        // Since we have effectively infinite registers, we hoist any value we can.
        // (The JIT may choose a more complex policy to reduce register pressure.)
        auto hoisted = [&](Val id) { return instructions[id].can_hoist; };

        fImpl->regs = 0;
        std::vector<Reg> avail;   // Registers whose value's last use has passed, free to reuse.

        // Assign this value to a register, recycling them where we can.
        auto assign_register = [&](Val id) {
            const OptimizedInstruction& inst = instructions[id];

            // If this is a real input and its lifetime ends at this instruction,
            // we can recycle the register it's occupying.
            auto maybe_recycle_register = [&](Val input) {
                // A hoisted value that's also used inside the loop stays live for
                // the whole program, so its register is never recycled.
                if (input != NA
                        && instructions[input].death == id
                        && !(hoisted(input) && instructions[input].used_in_loop)) {
                    avail.push_back(reg[input]);
                }
            };

            // Take care to not recycle the same register twice.
            if (true                                ) { maybe_recycle_register(inst.x); }
            if (inst.y != inst.x                    ) { maybe_recycle_register(inst.y); }
            if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }

            // Instructions that die at themselves (stores) don't need a register.
            if (inst.death != id) {
                // Allocate a register if we have to, preferring to reuse anything available.
                if (avail.empty()) {
                    reg[id] = fImpl->regs++;
                } else {
                    reg[id] = avail.back();
                    avail.pop_back();
                }
            }
        };

        // Assign a register to each hoisted instruction, then each non-hoisted loop instruction.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if ( hoisted(id)) { assign_register(id); }
        }
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            if (!hoisted(id)) { assign_register(id); }
        }

        // Translate OptimizedInstructions to InterpreterInstructions by mapping values to
        // registers.  This will be two passes, first hoisted instructions, then inside the loop.

        // The loop begins at the fImpl->loop'th Instruction.
        fImpl->loop = 0;
        fImpl->instructions.reserve(instructions.size());

        // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
        // so lookups don't have to know which arguments are used by which Ops.
        auto lookup_register = [&](Val id) {
            return id == NA ? (Reg)0
                            : reg[id];
        };

        // Append one InterpreterInstruction translated from inst (whose Val is id).
        auto push_instruction = [&](Val id, const OptimizedInstruction& inst) {
            InterpreterInstruction pinst{
                inst.op,
                lookup_register(id),
                lookup_register(inst.x),
               {lookup_register(inst.y)},
               {lookup_register(inst.z)},
            };
            // When y or z is the NA sentinel, that operand carries an immediate
            // rather than a register.
            if (inst.y == NA) { pinst.immy = inst.immy; }
            if (inst.z == NA) { pinst.immz = inst.immz; }
            fImpl->instructions.push_back(pinst);
        };

        // First pass: hoisted instructions, counting them so fImpl->loop marks
        // where the per-lane loop body begins.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (hoisted(id)) {
                push_instruction(id, inst);
                fImpl->loop++;
            }
        }
        // Second pass: the loop body itself.
        for (Val id = 0; id < (Val)instructions.size(); id++) {
            const OptimizedInstruction& inst = instructions[id];
            if (!hoisted(id)) {
                push_instruction(id, inst);
            }
        }
    }
2917 
2918 #if defined(SKVM_JIT)
2919 
2920     // Just so happens that we can translate the immediate control for our bytes() op
2921     // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
bytes_control(int imm,int mask[4])2922     static void bytes_control(int imm, int mask[4]) {
2923         auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t {
2924             // 0 -> 0xff,    Fill with zero
2925             // 1 -> 0x00,    Select byte 0
2926             // 2 -> 0x01,         "      1
2927             // 3 -> 0x02,         "      2
2928             // 4 -> 0x03,         "      3
2929             return n - 1;
2930         };
2931         uint8_t control[] = {
2932             nibble_to_vpshufb( (imm >>  0) & 0xf ),
2933             nibble_to_vpshufb( (imm >>  4) & 0xf ),
2934             nibble_to_vpshufb( (imm >>  8) & 0xf ),
2935             nibble_to_vpshufb( (imm >> 12) & 0xf ),
2936         };
2937         for (int i = 0; i < 4; i++) {
2938             mask[i] = (int)control[0] <<  0
2939                     | (int)control[1] <<  8
2940                     | (int)control[2] << 16
2941                     | (int)control[3] << 24;
2942 
2943             // Update each byte that refers to a byte index by 4 to
2944             // point into the next 32-bit lane, but leave any 0xff
2945             // that fills with zero alone.
2946             control[0] += control[0] == 0xff ? 0 : 4;
2947             control[1] += control[1] == 0xff ? 0 : 4;
2948             control[2] += control[2] == 0xff ? 0 : 4;
2949             control[3] += control[3] == 0xff ? 0 : 4;
2950         }
2951     }
2952 
jit(const std::vector<OptimizedInstruction> & instructions,const bool try_hoisting,Assembler * a) const2953     bool Program::jit(const std::vector<OptimizedInstruction>& instructions,
2954                       const bool try_hoisting,
2955                       Assembler* a) const {
2956         using A = Assembler;
2957 
2958         auto debug_dump = [&] {
2959         #if 0
2960             SkDebugfStream stream;
2961             this->dump(&stream);
2962             return true;
2963         #else
2964             return false;
2965         #endif
2966         };
2967 
2968     #if defined(__x86_64__)
2969         if (!SkCpu::Supports(SkCpu::HSW)) {
2970             return false;
2971         }
2972         A::GP64 N        = A::rdi,
2973                 scratch  = A::rax,
2974                 scratch2 = A::r11,
2975                 arg[]    = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
2976 
2977         // All 16 ymm registers are available to use.
2978         using Reg = A::Ymm;
2979         uint32_t avail = 0xffff;
2980 
2981     #elif defined(__aarch64__)
2982         A::X N       = A::x0,
2983              scratch = A::x8,
2984              arg[]   = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
2985 
2986         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
2987         using Reg = A::V;
2988         uint32_t avail = 0xffff00ff;
2989     #endif
2990 
2991         if (SK_ARRAY_COUNT(arg) < fImpl->strides.size()) {
2992             return false;
2993         }
2994 
2995         auto hoisted = [&](Val id) { return try_hoisting && instructions[id].can_hoist; };
2996 
2997         std::vector<Reg> r(instructions.size());
2998 
2999         struct LabelAndReg {
3000             A::Label label;
3001             Reg      reg;
3002         };
3003         SkTHashMap<int, LabelAndReg> constants,    // All constants share the same pool.
3004                                      bytes_masks;  // These vary per-lane.
3005         LabelAndReg                  iota;         // Exists _only_ to vary per-lane.
3006 
3007         auto warmup = [&](Val id) {
3008             const OptimizedInstruction& inst = instructions[id];
3009 
3010             switch (inst.op) {
3011                 default: break;
3012 
3013                 case Op::bytes: if (!bytes_masks.find(inst.immy)) {
3014                                     bytes_masks.set(inst.immy, {});
3015                                     if (try_hoisting) {
3016                                         // vpshufb can always work with the mask from memory,
3017                                         // but it helps to hoist the mask to a register for tbl.
3018                                     #if defined(__aarch64__)
3019                                         LabelAndReg* entry = bytes_masks.find(inst.immy);
3020                                         if (int found = __builtin_ffs(avail)) {
3021                                             entry->reg = (Reg)(found-1);
3022                                             avail ^= 1 << entry->reg;
3023                                             a->ldrq(entry->reg, &entry->label);
3024                                         } else {
3025                                             return false;
3026                                         }
3027                                     #endif
3028                                     }
3029                                 }
3030                                 break;
3031             }
3032             return true;
3033         };
3034 
3035         auto emit = [&](Val id, bool scalar) {
3036             const OptimizedInstruction& inst = instructions[id];
3037 
3038             Op op = inst.op;
3039             Val x = inst.x,
3040                 y = inst.y,
3041                 z = inst.z;
3042             int immy = inst.immy,
3043                 immz = inst.immz;
3044 
3045             // Most (but not all) ops create an output value and need a register to hold it, dst.
3046             // We track each instruction's dst in r[] so we can thread it through as an input
3047             // to any future instructions needing that value.
3048             //
3049             // And some ops may need a temporary register, tmp.  Some need both tmp and dst.
3050             //
3051             // tmp and dst are very similar and can and will often be assigned the same register,
3052             // but tmp may never alias any of the instructions's inputs, while dst may when this
3053             // instruction consumes that input, i.e. if the input reaches its end of life here.
3054             //
3055             // We'll assign both registers lazily to keep register pressure as low as possible.
3056             bool tmp_is_set = false,
3057                  dst_is_set = false;
3058             Reg tmp_reg = (Reg)0;  // This initial value won't matter... anything legal is fine.
3059 
3060             bool ok = true;   // Set to false if we need to assign a register and none's available.
3061 
3062             // First lock in how to choose tmp if we need to based on the registers
3063             // available before this instruction, not including any of its input registers.
3064             auto tmp = [&,avail/*important, closing over avail's current value*/]{
3065                 if (!tmp_is_set) {
3066                     tmp_is_set = true;
3067                     if (int found = __builtin_ffs(avail)) {
3068                         // This is a temporary register just for this op,
3069                         // so we leave it marked available for future ops.
3070                         tmp_reg = (Reg)(found - 1);
3071                     } else {
3072                         // We needed a tmp register but couldn't find one available. :'(
3073                         // This will cause emit() to return false, in turn causing jit() to fail.
3074                         if (debug_dump()) {
3075                             SkDebugf("\nCould not find a register to hold tmp\n");
3076                         }
3077                         ok = false;
3078                     }
3079                 }
3080                 return tmp_reg;
3081             };
3082 
3083             // Now make available any registers that are consumed by this instruction.
3084             // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
3085             auto maybe_recycle_register = [&](Val input) {
3086                 if (input != NA
3087                         && instructions[input].death == id
3088                         && !(hoisted(input) && instructions[input].used_in_loop)) {
3089                     avail |= 1 << r[input];
3090                 }
3091             };
3092             maybe_recycle_register(x);
3093             maybe_recycle_register(y);
3094             maybe_recycle_register(z);
3095             // set_dst() and dst() will work read/write with this perhaps-just-updated avail.
3096 
3097             // Some ops may decide dst on their own to best fit the instruction (see Op::fma_f32).
3098             auto set_dst = [&](Reg reg){
3099                 SkASSERT(dst_is_set == false);
3100                 dst_is_set = true;
3101 
3102                 SkASSERT(avail & (1<<reg));
3103                 avail ^= 1<<reg;
3104 
3105                 r[id] = reg;
3106             };
3107 
3108             // Thanks to AVX and NEON's 3-argument instruction sets,
3109             // most ops can use any register as dst.
3110             auto dst = [&]{
3111                 if (!dst_is_set) {
3112                     if (int found = __builtin_ffs(avail)) {
3113                         set_dst((Reg)(found-1));
3114                     } else {
3115                         // Same deal as with tmp... all the registers are occupied.  Time to fail!
3116                         if (debug_dump()) {
3117                             SkDebugf("\nCould not find a register to hold value %d\n", id);
3118                         }
3119                         ok = false;
3120                     }
3121                 }
3122                 return r[id];
3123             };
3124 
3125             // Because we use the same logic to pick an arbitrary dst and to pick tmp,
3126             // and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
3127             // is a simple idiom to check that the destination does not overlap any of the inputs.
3128             // Sometimes we can use this knowledge to do better instruction selection.
3129 
3130             // Ok!  Keep in mind that we haven't assigned tmp or dst yet,
3131             // just laid out hooks for how to do so if we need them, depending on the instruction.
3132             //
3133             // Now let's actually assemble the instruction!
3134             switch (op) {
3135                 default:
3136                     if (debug_dump()) {
3137                         SkDEBUGFAILF("\nOp::%s (%d) not yet implemented\n", name(op), op);
3138                     }
3139                     return false;  // TODO: many new ops
3140 
3141             #if defined(__x86_64__)
3142                 case Op::assert_true: {
3143                     a->vptest (r[x], &constants[0xffffffff].label);
3144                     A::Label all_true;
3145                     a->jc(&all_true);
3146                     a->int3();
3147                     a->label(&all_true);
3148                 } break;
3149 
3150                 case Op::store8: if (scalar) { a->vpextrb  (arg[immy], (A::Xmm)r[x], 0); }
3151                                  else        { a->vpackusdw(tmp(), r[x], r[x]);
3152                                                a->vpermq   (tmp(), tmp(), 0xd8);
3153                                                a->vpackuswb(tmp(), tmp(), tmp());
3154                                                a->vmovq    (arg[immy], (A::Xmm)tmp()); }
3155                                                break;
3156 
3157                 case Op::store16: if (scalar) { a->vpextrw  (arg[immy], (A::Xmm)r[x], 0); }
3158                                   else        { a->vpackusdw(tmp(), r[x], r[x]);
3159                                                 a->vpermq   (tmp(), tmp(), 0xd8);
3160                                                 a->vmovups  (arg[immy], (A::Xmm)tmp()); }
3161                                                 break;
3162 
3163                 case Op::store32: if (scalar) { a->vmovd  (arg[immy], (A::Xmm)r[x]); }
3164                                   else        { a->vmovups(arg[immy],         r[x]); }
3165                                                 break;
3166 
3167                 case Op::load8:  if (scalar) {
3168                                      a->vpxor  (dst(), dst(), dst());
3169                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0);
3170                                  } else {
3171                                      a->vpmovzxbd(dst(), arg[immy]);
3172                                  } break;
3173 
3174                 case Op::load16: if (scalar) {
3175                                      a->vpxor  (dst(), dst(), dst());
3176                                      a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[immy], 0);
3177                                  } else {
3178                                      a->vpmovzxwd(dst(), arg[immy]);
3179                                  } break;
3180 
3181                 case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), arg[immy]); }
3182                                  else        { a->vmovups(        dst(), arg[immy]); }
3183                                  break;
3184 
3185                 case Op::gather32:
3186                 if (scalar) {
3187                     auto base  = scratch,
3188                          index = scratch2;
3189                     // Our gather base pointer is immz bytes off of uniform immy.
3190                     a->movq(base, arg[immy], immz);
3191 
3192                     // Grab our index from lane 0 of the index argument.
3193                     a->vmovd_direct(index, (A::Xmm)r[x]);
3194 
3195                     // dst = *(base + 4*index)
3196                     a->vmovd((A::Xmm)dst(), A::FOUR, index, base);
3197                 } else {
3198                     // We may not let any of dst(), index, or mask use the same register,
3199                     // so we must allocate registers manually and very carefully.
3200 
3201                     // index is argument x and has already been maybe_recycle_register()'d,
3202                     // so we explicitly ignore its availability during this op.
3203                     A::Ymm index = r[x];
3204                     uint32_t avail_during_gather = avail & ~(1<<index);
3205 
3206                     // Choose dst() to not overlap with index.
3207                     if (int found = __builtin_ffs(avail_during_gather)) {
3208                         set_dst((A::Ymm)(found-1));
3209                         avail_during_gather ^= (1<<dst());
3210                     } else {
3211                         ok = false;
3212                         break;
3213                     }
3214 
3215                     // Choose (temporary) mask to not overlap with dst() or index.
3216                     A::Ymm mask;
3217                     if (int found = __builtin_ffs(avail_during_gather)) {
3218                         mask = (A::Ymm)(found-1);
3219                     } else {
3220                         ok = false;
3221                         break;
3222                     }
3223 
3224                     // Our gather base pointer is immz bytes off of uniform immy.
3225                     auto base = scratch;
3226                     a->movq(base, arg[immy], immz);
3227                     a->vpcmpeqd(mask, mask, mask);   // (All lanes enabled.)
3228                     a->vgatherdps(dst(), A::FOUR, index, base, mask);
3229                 }
3230                 break;
3231 
3232                 case Op::uniform8: a->movzbl(scratch, arg[immy], immz);
3233                                    a->vmovd_direct((A::Xmm)dst(), scratch);
3234                                    a->vbroadcastss(dst(), (A::Xmm)dst());
3235                                    break;
3236 
3237                 case Op::uniform32: a->vbroadcastss(dst(), arg[immy], immz);
3238                                     break;
3239 
3240                 case Op::index: a->vmovd_direct((A::Xmm)tmp(), N);
3241                                 a->vbroadcastss(tmp(), (A::Xmm)tmp());
3242                                 a->vpsubd(dst(), tmp(), &iota.label);
3243                                 break;
3244 
3245                 case Op::splat: if (immy) { a->vbroadcastss(dst(), &constants[immy].label); }
3246                                 else      { a->vpxor(dst(), dst(), dst()); }
3247                                 break;
3248 
3249                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
3250                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
3251                 case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break;
3252                 case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;
3253                 case Op::min_f32: a->vminps(dst(), r[x], r[y]); break;
3254                 case Op::max_f32: a->vmaxps(dst(), r[x], r[y]); break;
3255 
3256                 case Op::fma_f32:
3257                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
3258                     else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); }
3259                     else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); }
3260                     else                        {                SkASSERT(dst() == tmp());
3261                                                                  a->vmovdqa    (dst(),r[x]);
3262                                                                  a->vfmadd132ps(dst(),r[z], r[y]); }
3263                                                                  break;
3264 
3265                 case Op::fms_f32:
3266                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmsub132ps(r[x], r[z], r[y]); }
3267                     else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmsub213ps(r[y], r[x], r[z]); }
3268                     else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmsub231ps(r[z], r[x], r[y]); }
3269                     else                        {                SkASSERT(dst() == tmp());
3270                                                                  a->vmovdqa    (dst(),r[x]);
3271                                                                  a->vfmsub132ps(dst(),r[z], r[y]); }
3272                                                                  break;
3273 
3274                 case Op::fnma_f32:
3275                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfnmadd132ps(r[x],r[z], r[y]); }
3276                     else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfnmadd213ps(r[y],r[x], r[z]); }
3277                     else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfnmadd231ps(r[z],r[x], r[y]); }
3278                     else                        {                SkASSERT(dst() == tmp());
3279                                                                  a->vmovdqa    (dst(),r[x]);
3280                                                                  a->vfnmadd132ps(dst(),r[z],r[y]); }
3281                                                                  break;
3282 
3283                 case Op::sqrt_f32: a->vsqrtps(dst(), r[x]); break;
3284 
3285                 case Op::add_f32_imm: a->vaddps(dst(), r[x], &constants[immy].label); break;
3286                 case Op::sub_f32_imm: a->vsubps(dst(), r[x], &constants[immy].label); break;
3287                 case Op::mul_f32_imm: a->vmulps(dst(), r[x], &constants[immy].label); break;
3288                 case Op::min_f32_imm: a->vminps(dst(), r[x], &constants[immy].label); break;
3289                 case Op::max_f32_imm: a->vmaxps(dst(), r[x], &constants[immy].label); break;
3290 
3291                 case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break;
3292                 case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break;
3293                 case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break;
3294 
3295                 case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break;
3296                 case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break;
3297                 case Op::shr_i16x2: a->vpsrlw (dst(), r[x], immy); break;
3298 
3299                 case Op::bit_and  : a->vpand (dst(), r[x], r[y]); break;
3300                 case Op::bit_or   : a->vpor  (dst(), r[x], r[y]); break;
3301                 case Op::bit_xor  : a->vpxor (dst(), r[x], r[y]); break;
3302                 case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break;  // N.B. Y then X.
3303                 case Op::select   : a->vpblendvb(dst(), r[z], r[y], r[x]); break;
3304 
3305                 case Op::bit_and_imm: a->vpand (dst(), r[x], &constants[immy].label); break;
3306                 case Op::bit_or_imm : a->vpor  (dst(), r[x], &constants[immy].label); break;
3307                 case Op::bit_xor_imm: a->vpxor (dst(), r[x], &constants[immy].label); break;
3308 
3309                 case Op::shl_i32: a->vpslld(dst(), r[x], immy); break;
3310                 case Op::shr_i32: a->vpsrld(dst(), r[x], immy); break;
3311                 case Op::sra_i32: a->vpsrad(dst(), r[x], immy); break;
3312 
3313                 case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break;
3314                 case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break;
3315 
3316                 case Op:: eq_f32: a->vcmpeqps (dst(), r[x], r[y]); break;
3317                 case Op::neq_f32: a->vcmpneqps(dst(), r[x], r[y]); break;
3318                 case Op:: gt_f32: a->vcmpltps (dst(), r[y], r[x]); break;
3319                 case Op::gte_f32: a->vcmpleps (dst(), r[y], r[x]); break;
3320 
3321                 case Op::pack: a->vpslld(tmp(),  r[y], immz);
3322                                a->vpor  (dst(), tmp(), r[x]);
3323                                break;
3324 
3325                 case Op::floor : a->vroundps  (dst(), r[x], Assembler::FLOOR); break;
3326                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
3327                 case Op::trunc : a->vcvttps2dq(dst(), r[x]); break;
3328                 case Op::round : a->vcvtps2dq (dst(), r[x]); break;
3329 
3330                 case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(immy)->label);
3331                                 break;
3332 
3333             #elif defined(__aarch64__)
3334                 case Op::assert_true: {
3335                     a->uminv4s(tmp(), r[x]);   // uminv acts like an all() across the vector.
3336                     a->fmovs(scratch, tmp());
3337                     A::Label all_true;
3338                     a->cbnz(scratch, &all_true);
3339                     a->brk(0);
3340                     a->label(&all_true);
3341                 } break;
3342 
3343                 case Op::store8: a->xtns2h(tmp(), r[x]);
3344                                  a->xtnh2b(tmp(), tmp());
3345                    if (scalar) { a->strb  (tmp(), arg[immy]); }
3346                    else        { a->strs  (tmp(), arg[immy]); }
3347                                  break;
3348                 // TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here.
3349 
3350                 case Op::store32: if (scalar) { a->strs(r[x], arg[immy]); }
3351                                   else        { a->strq(r[x], arg[immy]); }
3352                                                 break;
3353 
3354                 case Op::load8: if (scalar) { a->ldrb(tmp(), arg[immy]); }
3355                                 else        { a->ldrs(tmp(), arg[immy]); }
3356                                               a->uxtlb2h(tmp(), tmp());
3357                                               a->uxtlh2s(dst(), tmp());
3358                                               break;
3359 
3360                 case Op::load32: if (scalar) { a->ldrs(dst(), arg[immy]); }
3361                                  else        { a->ldrq(dst(), arg[immy]); }
3362                                                break;
3363 
3364                 case Op::splat: if (immy) { a->ldrq(dst(), &constants[immy].label); }
3365                                 else      { a->eor16b(dst(), dst(), dst()); }
3366                                 break;
3367                                 // TODO: If we hoist these, pack 4 values in each register
3368                                 // and use vector/lane operations, cutting the register
3369                                 // pressure cost of hoisting by 4?
3370 
3371                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
3372                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
3373                 case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
3374                 case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
3375                 case Op::min_f32: a->fmin4s(dst(), r[x], r[y]); break;
3376                 case Op::max_f32: a->fmax4s(dst(), r[x], r[y]); break;
3377 
3378                 case Op::fma_f32: // fmla.4s is z += x*y
3379                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z],  r[x],  r[y]);   }
3380                     else {                                  a->orr16b(tmp(),  r[z],  r[z]);
3381                                                             a->fmla4s(tmp(),  r[x],  r[y]);
3382                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3383                                                             break;
3384 
3385                 case Op::fnma_f32:  // fmls.4s is z -= x*y
3386                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmls4s( r[z],  r[x],  r[y]);   }
3387                     else {                                  a->orr16b(tmp(),  r[z],  r[z]);
3388                                                             a->fmls4s(tmp(),  r[x],  r[y]);
3389                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3390                                                             break;
3391 
3392                 case Op::fms_f32:
3393                     // first dst() = xy - z as if fnma_f32
3394                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmls4s( r[z],  r[x],  r[y]);   }
3395                     else {                                  a->orr16b(tmp(),  r[z],  r[z]);
3396                                                             a->fmls4s(tmp(),  r[x],  r[y]);
3397                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3398                     // then dst() = -dst()  (i.e. z - xy)
3399                                                             a->fneg4s(dst(), dst());
3400                                                             break;
3401 
3402                 // These _imm instructions are all x86/JIT only.
3403                 case  Op::add_f32_imm :
3404                 case  Op::sub_f32_imm :
3405                 case  Op::mul_f32_imm :
3406                 case  Op::min_f32_imm :
3407                 case  Op::max_f32_imm :
3408                 case  Op::bit_and_imm :
3409                 case  Op::bit_or_imm  :
3410                 case  Op::bit_xor_imm : SkUNREACHABLE; break;
3411 
3412                 case Op:: gt_f32: a->fcmgt4s (dst(), r[x], r[y]); break;
3413                 case Op::gte_f32: a->fcmge4s (dst(), r[x], r[y]); break;
3414                 case Op:: eq_f32: a->fcmeq4s (dst(), r[x], r[y]); break;
3415                 case Op::neq_f32: a->fcmeq4s (tmp(), r[x], r[y]);
3416                                   a->not16b  (dst(), tmp());      break;
3417 
3418 
3419                 case Op::add_i32: a->add4s(dst(), r[x], r[y]); break;
3420                 case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break;
3421                 case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break;
3422 
3423                 case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break;
3424                 case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break;
3425                 case Op::shr_i16x2: a->ushr8h(dst(), r[x], immy); break;
3426 
3427                 case Op::bit_and  : a->and16b(dst(), r[x], r[y]); break;
3428                 case Op::bit_or   : a->orr16b(dst(), r[x], r[y]); break;
3429                 case Op::bit_xor  : a->eor16b(dst(), r[x], r[y]); break;
3430                 case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;
3431 
3432                 case Op::select: // bsl16b is x = x ? y : z
3433                     if (avail & (1<<r[x])) { set_dst(r[x]); a->bsl16b( r[x],  r[y],  r[z]); }
3434                     else {                                  a->orr16b(tmp(),  r[x],  r[x]);
3435                                                             a->bsl16b(tmp(),  r[y],  r[z]);
3436                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
3437                                                             break;
3438 
3439                 case Op::shl_i32: a-> shl4s(dst(), r[x], immy); break;
3440                 case Op::shr_i32: a->ushr4s(dst(), r[x], immy); break;
3441                 case Op::sra_i32: a->sshr4s(dst(), r[x], immy); break;
3442 
3443                 case Op::eq_i32: a->cmeq4s(dst(), r[x], r[y]); break;
3444                 case Op::gt_i32: a->cmgt4s(dst(), r[x], r[y]); break;
3445 
3446                 case Op::pack:
3447                     if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x],  r[y],  immz); }
3448                     else                   {                a->shl4s (tmp(),  r[y],  immz);
3449                                                             a->orr16b(dst(), tmp(),  r[x]); }
3450                                                             break;
3451 
3452                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
3453                 case Op::trunc:  a->fcvtzs4s(dst(), r[x]); break;
3454                 case Op::round:  a->fcvtns4s(dst(), r[x]); break;
3455                 // TODO: fcvtns.4s rounds to nearest even.
3456                 // I think we actually want frintx -> fcvtzs to round to current mode.
3457 
3458                 case Op::bytes:
3459                     if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(immy)->reg); }
3460                     else              { a->ldrq(tmp(), &bytes_masks.find(immy)->label);
3461                                         a->tbl (dst(), r[x], tmp()); }
3462                                         break;
3463             #endif
3464             }
3465 
3466             // Calls to tmp() or dst() might have flipped this false from its default true state.
3467             return ok;
3468         };
3469 
3470 
3471         #if defined(__x86_64__)
3472             const int K = 8;
3473             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
3474             auto jump         = [&](A::Label* l) { a->jmp(l); };
3475 
3476             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
3477             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
3478 
3479             auto exit = [&]{ a->vzeroupper(); a->ret(); };
3480         #elif defined(__aarch64__)
3481             const int K = 4;
3482             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
3483             auto jump         = [&](A::Label* l) { a->b  (l); };
3484 
3485             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
3486             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
3487 
3488             auto exit = [&]{ a->ret(A::x30); };
3489         #endif
3490 
3491         A::Label body,
3492                  tail,
3493                  done;
3494 
3495         for (Val id = 0; id < (Val)instructions.size(); id++) {
3496             if (!warmup(id)) {
3497                 return false;
3498             }
3499             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
3500                 return false;
3501             }
3502         }
3503 
3504         a->label(&body);
3505         {
3506             a->cmp(N, K);
3507             jump_if_less(&tail);
3508             for (Val id = 0; id < (Val)instructions.size(); id++) {
3509                 if (!hoisted(id) && !emit(id, /*scalar=*/false)) {
3510                     return false;
3511                 }
3512             }
3513             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
3514                 if (fImpl->strides[i]) {
3515                     add(arg[i], K*fImpl->strides[i]);
3516                 }
3517             }
3518             sub(N, K);
3519             jump(&body);
3520         }
3521 
3522         a->label(&tail);
3523         {
3524             a->cmp(N, 1);
3525             jump_if_less(&done);
3526             for (Val id = 0; id < (Val)instructions.size(); id++) {
3527                 if (!hoisted(id) && !emit(id, /*scalar=*/true)) {
3528                     return false;
3529                 }
3530             }
3531             for (int i = 0; i < (int)fImpl->strides.size(); i++) {
3532                 if (fImpl->strides[i]) {
3533                     add(arg[i], 1*fImpl->strides[i]);
3534                 }
3535             }
3536             sub(N, 1);
3537             jump(&tail);
3538         }
3539 
3540         a->label(&done);
3541         {
3542             exit();
3543         }
3544 
3545         // Except for explicit aligned load and store instructions, AVX allows
3546         // memory operands to be unaligned.  So even though we're creating 16
3547         // byte patterns on ARM or 32-byte patterns on x86, we only need to
3548         // align to 4 bytes, the element size and alignment requirement.
3549 
3550         constants.foreach([&](int imm, LabelAndReg* entry) {
3551             a->align(4);
3552             a->label(&entry->label);
3553             for (int i = 0; i < K; i++) {
3554                 a->word(imm);
3555             }
3556         });
3557 
3558         bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
3559             // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
3560             a->align(4);
3561             a->label(&entry->label);
3562             int mask[4];
3563             bytes_control(imm, mask);
3564             a->bytes(mask, sizeof(mask));
3565         #if defined(__x86_64__)
3566             a->bytes(mask, sizeof(mask));
3567         #endif
3568         });
3569 
3570         if (!iota.label.references.empty()) {
3571             a->align(4);
3572             a->label(&iota.label);
3573             for (int i = 0; i < K; i++) {
3574                 a->word(i);
3575             }
3576         }
3577 
3578         return true;
3579     }
3580 
setupJIT(const std::vector<OptimizedInstruction> & instructions,const char * debug_name)3581     void Program::setupJIT(const std::vector<OptimizedInstruction>& instructions,
3582                            const char* debug_name) {
3583         // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
3584         Assembler a{nullptr};
3585 
3586         // First try allowing code hoisting (faster code)
3587         // then again without if that fails (lower register pressure).
3588         bool try_hoisting = true;
3589         if (!this->jit(instructions, try_hoisting, &a)) {
3590             try_hoisting = false;
3591             if (!this->jit(instructions, try_hoisting, &a)) {
3592                 return;
3593             }
3594         }
3595 
3596         // Allocate space that we can remap as executable.
3597         const size_t page = sysconf(_SC_PAGESIZE);
3598 
3599         // mprotect works at page granularity.
3600         fImpl->jit_size = ((a.size() + page - 1) / page) * page;
3601 
3602         void* jit_entry
3603              = mmap(nullptr,fImpl->jit_size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
3604         fImpl->jit_entry.store(jit_entry);
3605 
3606         // Assemble the program for real.
3607         a = Assembler{jit_entry};
3608         SkAssertResult(this->jit(instructions, try_hoisting, &a));
3609         SkASSERT(a.size() <= fImpl->jit_size);
3610 
3611         // Remap as executable, and flush caches on platforms that need that.
3612         mprotect(jit_entry, fImpl->jit_size, PROT_READ|PROT_EXEC);
3613         __builtin___clear_cache((char*)jit_entry,
3614                                 (char*)jit_entry + fImpl->jit_size);
3615 
3616         // For profiling and debugging, it's helpful to have this code loaded
3617         // dynamically rather than just jumping info fImpl->jit_entry.
3618         if (gSkVMJITViaDylib) {
3619             // Dump the raw program binary.
3620             SkString path = SkStringPrintf("/tmp/%s.XXXXXX", debug_name);
3621             int fd = mkstemp(path.writable_str());
3622             ::write(fd, jit_entry, a.size());
3623             close(fd);
3624 
3625             this->dropJIT();  // (unmap and null out fImpl->jit_entry.)
3626 
3627             // Convert it in-place to a dynamic library with a single symbol "skvm_jit":
3628             SkString cmd = SkStringPrintf(
3629                     "echo '.global _skvm_jit\n_skvm_jit: .incbin \"%s\"'"
3630                     " | clang -x assembler -shared - -o %s",
3631                     path.c_str(), path.c_str());
3632             system(cmd.c_str());
3633 
3634             // Load that dynamic library and look up skvm_jit().
3635             fImpl->dylib = dlopen(path.c_str(), RTLD_NOW|RTLD_LOCAL);
3636             fImpl->jit_entry.store(dlsym(fImpl->dylib, "skvm_jit"));
3637         }
3638     }
3639 #endif
3640 
3641 }  // namespace skvm
3642