1 /*
2  * Copyright 2019 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/private/SkSpinlock.h"
10 #include "include/private/SkTFitsIn.h"
11 #include "include/private/SkThreadID.h"
12 #include "include/private/SkVx.h"
13 #include "src/core/SkCpu.h"
14 #include "src/core/SkVM.h"
15 #include <string.h>
16 #if defined(SKVM_JIT)
17     #include <sys/mman.h>
18 #endif
19 #if defined(SKVM_PERF_DUMPS)
20     #include <stdio.h>
21     #include <time.h>
22 #endif
23 
24 
25 namespace skvm {
26 
27     // Debugging tools, mostly for printing various data structures out to a stream.
28 
29     namespace {
30         class SkDebugfStream final : public SkWStream {
31             size_t fBytesWritten = 0;
32 
write(const void * buffer,size_t size)33             bool write(const void* buffer, size_t size) override {
34                 SkDebugf("%.*s", size, buffer);
35                 fBytesWritten += size;
36                 return true;
37             }
38 
bytesWritten() const39             size_t bytesWritten() const override {
40                 return fBytesWritten;
41             }
42         };
43 
44         struct V { Val id; };
45         struct R { Reg id; };
46         struct Shift { int bits; };
47         struct Splat { int bits; };
48         struct Hex   { int bits; };
49 
write(SkWStream * o,const char * s)50         static void write(SkWStream* o, const char* s) {
51             o->writeText(s);
52         }
53 
write(SkWStream * o,Arg a)54         static void write(SkWStream* o, Arg a) {
55             write(o, "arg(");
56             o->writeDecAsText(a.ix);
57             write(o, ")");
58         }
write(SkWStream * o,V v)59         static void write(SkWStream* o, V v) {
60             write(o, "v");
61             o->writeDecAsText(v.id);
62         }
write(SkWStream * o,R r)63         static void write(SkWStream* o, R r) {
64             write(o, "r");
65             o->writeDecAsText(r.id);
66         }
write(SkWStream * o,Shift s)67         static void write(SkWStream* o, Shift s) {
68             o->writeDecAsText(s.bits);
69         }
write(SkWStream * o,Splat s)70         static void write(SkWStream* o, Splat s) {
71             float f;
72             memcpy(&f, &s.bits, 4);
73             o->writeHexAsText(s.bits);
74             write(o, " (");
75             o->writeScalarAsText(f);
76             write(o, ")");
77         }
write(SkWStream * o,Hex h)78         static void write(SkWStream* o, Hex h) {
79             o->writeHexAsText(h.bits);
80         }
81 
82         template <typename T, typename... Ts>
write(SkWStream * o,T first,Ts...rest)83         static void write(SkWStream* o, T first, Ts... rest) {
84             write(o, first);
85             write(o, " ");
86             write(o, rest...);
87         }
88     }
89 
dump_builder_program(const std::vector<Builder::Instruction> & program,SkWStream * o)90     static void dump_builder_program(const std::vector<Builder::Instruction>& program,
91                                      SkWStream* o) {
92         for (Val id = 0; id < (Val)program.size(); id++) {
93             const Builder::Instruction& inst = program[id];
94             Op  op = inst.op;
95             Val  x = inst.x,
96                  y = inst.y,
97                  z = inst.z;
98             int imm = inst.imm;
99             write(o,  inst.death == 0   ? "☠️ " :
100                      !inst.can_hoist    ? "  " :
101                       inst.used_in_loop ? "↑ " :
102                                           "↟ ");
103             switch (op) {
104                 case Op::store8:  write(o, "store8" , Arg{imm}, V{x}); break;
105                 case Op::store16: write(o, "store16", Arg{imm}, V{x}); break;
106                 case Op::store32: write(o, "store32", Arg{imm}, V{x}); break;
107 
108                 case Op::load8:  write(o, V{id}, "= load8" , Arg{imm}); break;
109                 case Op::load16: write(o, V{id}, "= load16", Arg{imm}); break;
110                 case Op::load32: write(o, V{id}, "= load32", Arg{imm}); break;
111 
112                 case Op::gather8:  write(o, V{id}, "= gather8" , Arg{imm}, V{x}); break;
113                 case Op::gather16: write(o, V{id}, "= gather16", Arg{imm}, V{x}); break;
114                 case Op::gather32: write(o, V{id}, "= gather32", Arg{imm}, V{x}); break;
115 
116                 case Op::uniform8:  write(o, V{id}, "= uniform8" , Arg{imm & 0xffff}, Hex{imm>>16}); break;
117                 case Op::uniform16: write(o, V{id}, "= uniform16", Arg{imm & 0xffff}, Hex{imm>>16}); break;
118                 case Op::uniform32: write(o, V{id}, "= uniform32", Arg{imm & 0xffff}, Hex{imm>>16}); break;
119 
120                 case Op::splat:  write(o, V{id}, "= splat", Splat{imm}); break;
121 
122 
123                 case Op::add_f32: write(o, V{id}, "= add_f32", V{x}, V{y}      ); break;
124                 case Op::sub_f32: write(o, V{id}, "= sub_f32", V{x}, V{y}      ); break;
125                 case Op::mul_f32: write(o, V{id}, "= mul_f32", V{x}, V{y}      ); break;
126                 case Op::div_f32: write(o, V{id}, "= div_f32", V{x}, V{y}      ); break;
127                 case Op::mad_f32: write(o, V{id}, "= mad_f32", V{x}, V{y}, V{z}); break;
128 
129                 case Op:: eq_f32: write(o, V{id}, "= eq_f32", V{x}, V{y}); break;
130                 case Op::neq_f32: write(o, V{id}, "= neq_f32", V{x}, V{y}); break;
131                 case Op:: lt_f32: write(o, V{id}, "= lt_f32", V{x}, V{y}); break;
132                 case Op::lte_f32: write(o, V{id}, "= lte_f32", V{x}, V{y}); break;
133                 case Op:: gt_f32: write(o, V{id}, "= gt_f32", V{x}, V{y}); break;
134                 case Op::gte_f32: write(o, V{id}, "= gte_f32", V{x}, V{y}); break;
135 
136 
137                 case Op::add_i32: write(o, V{id}, "= add_i32", V{x}, V{y}); break;
138                 case Op::sub_i32: write(o, V{id}, "= sub_i32", V{x}, V{y}); break;
139                 case Op::mul_i32: write(o, V{id}, "= mul_i32", V{x}, V{y}); break;
140 
141                 case Op::shl_i32: write(o, V{id}, "= shl_i32", V{x}, Shift{imm}); break;
142                 case Op::shr_i32: write(o, V{id}, "= shr_i32", V{x}, Shift{imm}); break;
143                 case Op::sra_i32: write(o, V{id}, "= sra_i32", V{x}, Shift{imm}); break;
144 
145                 case Op:: eq_i32: write(o, V{id}, "= eq_i32", V{x}, V{y}); break;
146                 case Op::neq_i32: write(o, V{id}, "= neq_i32", V{x}, V{y}); break;
147                 case Op:: lt_i32: write(o, V{id}, "= lt_i32", V{x}, V{y}); break;
148                 case Op::lte_i32: write(o, V{id}, "= lte_i32", V{x}, V{y}); break;
149                 case Op:: gt_i32: write(o, V{id}, "= gt_i32", V{x}, V{y}); break;
150                 case Op::gte_i32: write(o, V{id}, "= gte_i32", V{x}, V{y}); break;
151 
152                 case Op::add_i16x2: write(o, V{id}, "= add_i16x2", V{x}, V{y}); break;
153                 case Op::sub_i16x2: write(o, V{id}, "= sub_i16x2", V{x}, V{y}); break;
154                 case Op::mul_i16x2: write(o, V{id}, "= mul_i16x2", V{x}, V{y}); break;
155 
156                 case Op::shl_i16x2: write(o, V{id}, "= shl_i16x2", V{x}, Shift{imm}); break;
157                 case Op::shr_i16x2: write(o, V{id}, "= shr_i16x2", V{x}, Shift{imm}); break;
158                 case Op::sra_i16x2: write(o, V{id}, "= sra_i16x2", V{x}, Shift{imm}); break;
159 
160                 case Op:: eq_i16x2: write(o, V{id}, "= eq_i16x2", V{x}, V{y}); break;
161                 case Op::neq_i16x2: write(o, V{id}, "= neq_i16x2", V{x}, V{y}); break;
162                 case Op:: lt_i16x2: write(o, V{id}, "= lt_i16x2", V{x}, V{y}); break;
163                 case Op::lte_i16x2: write(o, V{id}, "= lte_i16x2", V{x}, V{y}); break;
164                 case Op:: gt_i16x2: write(o, V{id}, "= gt_i16x2", V{x}, V{y}); break;
165                 case Op::gte_i16x2: write(o, V{id}, "= gte_i16x2", V{x}, V{y}); break;
166 
167                 case Op::bit_and  : write(o, V{id}, "= bit_and"  , V{x}, V{y}      ); break;
168                 case Op::bit_or   : write(o, V{id}, "= bit_or"   , V{x}, V{y}      ); break;
169                 case Op::bit_xor  : write(o, V{id}, "= bit_xor"  , V{x}, V{y}      ); break;
170                 case Op::bit_clear: write(o, V{id}, "= bit_clear", V{x}, V{y}      ); break;
171                 case Op::select   : write(o, V{id}, "= select"   , V{x}, V{y}, V{z}); break;
172 
173                 case Op::bytes:   write(o, V{id}, "= bytes",   V{x}, Hex{imm}); break;
174                 case Op::extract: write(o, V{id}, "= extract", V{x}, Shift{imm}, V{y}); break;
175                 case Op::pack:    write(o, V{id}, "= pack",    V{x}, V{y}, Shift{imm}); break;
176 
177                 case Op::to_f32: write(o, V{id}, "= to_f32", V{x}); break;
178                 case Op::to_i32: write(o, V{id}, "= to_i32", V{x}); break;
179             }
180 
181             write(o, "\n");
182         }
183     }
184 
dump(SkWStream * o) const185     void Builder::dump(SkWStream* o) const {
186         SkDebugfStream debug;
187         if (!o) { o = &debug; }
188 
189         o->writeDecAsText(fProgram.size());
190         o->writeText(" values:\n");
191         dump_builder_program(fProgram, o);
192     }
193 
dump(SkWStream * o) const194     void Program::dump(SkWStream* o) const {
195         SkDebugfStream debug;
196         if (!o) { o = &debug; }
197 
198         o->writeDecAsText(fRegs);
199         o->writeText(" registers, ");
200         o->writeDecAsText(fInstructions.size());
201         o->writeText(" instructions:\n");
202         for (int i = 0; i < (int)fInstructions.size(); i++) {
203             if (i == fLoop) {
204                 write(o, "loop:\n");
205             }
206             const Program::Instruction& inst = fInstructions[i];
207             Op   op = inst.op;
208             Reg   d = inst.d,
209                   x = inst.x,
210                   y = inst.y,
211                   z = inst.z;
212             int imm = inst.imm;
213             switch (op) {
214                 case Op::store8:  write(o, "store8" , Arg{imm}, R{x}); break;
215                 case Op::store16: write(o, "store16", Arg{imm}, R{x}); break;
216                 case Op::store32: write(o, "store32", Arg{imm}, R{x}); break;
217 
218                 case Op::load8:  write(o, R{d}, "= load8" , Arg{imm}); break;
219                 case Op::load16: write(o, R{d}, "= load16", Arg{imm}); break;
220                 case Op::load32: write(o, R{d}, "= load32", Arg{imm}); break;
221 
222                 case Op::gather8:  write(o, R{d}, "= gather8" , Arg{imm}, R{x}); break;
223                 case Op::gather16: write(o, R{d}, "= gather16", Arg{imm}, R{x}); break;
224                 case Op::gather32: write(o, R{d}, "= gather32", Arg{imm}, R{x}); break;
225 
226                 case Op::uniform8:  write(o, R{d}, "= uniform8" , Arg{imm & 0xffff}, Hex{imm>>16}); break;
227                 case Op::uniform16: write(o, R{d}, "= uniform16", Arg{imm & 0xffff}, Hex{imm>>16}); break;
228                 case Op::uniform32: write(o, R{d}, "= uniform32", Arg{imm & 0xffff}, Hex{imm>>16}); break;
229 
230                 case Op::splat:  write(o, R{d}, "= splat", Splat{imm}); break;
231 
232 
233                 case Op::add_f32: write(o, R{d}, "= add_f32", R{x}, R{y}      ); break;
234                 case Op::sub_f32: write(o, R{d}, "= sub_f32", R{x}, R{y}      ); break;
235                 case Op::mul_f32: write(o, R{d}, "= mul_f32", R{x}, R{y}      ); break;
236                 case Op::div_f32: write(o, R{d}, "= div_f32", R{x}, R{y}      ); break;
237                 case Op::mad_f32: write(o, R{d}, "= mad_f32", R{x}, R{y}, R{z}); break;
238 
239                 case Op:: eq_f32: write(o, R{d}, "= eq_f32", R{x}, R{y}); break;
240                 case Op::neq_f32: write(o, R{d}, "= neq_f32", R{x}, R{y}); break;
241                 case Op:: lt_f32: write(o, R{d}, "= lt_f32", R{x}, R{y}); break;
242                 case Op::lte_f32: write(o, R{d}, "= lte_f32", R{x}, R{y}); break;
243                 case Op:: gt_f32: write(o, R{d}, "= gt_f32", R{x}, R{y}); break;
244                 case Op::gte_f32: write(o, R{d}, "= gte_f32", R{x}, R{y}); break;
245 
246 
247                 case Op::add_i32: write(o, R{d}, "= add_i32", R{x}, R{y}); break;
248                 case Op::sub_i32: write(o, R{d}, "= sub_i32", R{x}, R{y}); break;
249                 case Op::mul_i32: write(o, R{d}, "= mul_i32", R{x}, R{y}); break;
250 
251                 case Op::shl_i32: write(o, R{d}, "= shl_i32", R{x}, Shift{imm}); break;
252                 case Op::shr_i32: write(o, R{d}, "= shr_i32", R{x}, Shift{imm}); break;
253                 case Op::sra_i32: write(o, R{d}, "= sra_i32", R{x}, Shift{imm}); break;
254 
255                 case Op:: eq_i32: write(o, R{d}, "= eq_i32", R{x}, R{y}); break;
256                 case Op::neq_i32: write(o, R{d}, "= neq_i32", R{x}, R{y}); break;
257                 case Op:: lt_i32: write(o, R{d}, "= lt_i32", R{x}, R{y}); break;
258                 case Op::lte_i32: write(o, R{d}, "= lte_i32", R{x}, R{y}); break;
259                 case Op:: gt_i32: write(o, R{d}, "= gt_i32", R{x}, R{y}); break;
260                 case Op::gte_i32: write(o, R{d}, "= gte_i32", R{x}, R{y}); break;
261 
262 
263                 case Op::add_i16x2: write(o, R{d}, "= add_i16x2", R{x}, R{y}); break;
264                 case Op::sub_i16x2: write(o, R{d}, "= sub_i16x2", R{x}, R{y}); break;
265                 case Op::mul_i16x2: write(o, R{d}, "= mul_i16x2", R{x}, R{y}); break;
266 
267                 case Op::shl_i16x2: write(o, R{d}, "= shl_i16x2", R{x}, Shift{imm}); break;
268                 case Op::shr_i16x2: write(o, R{d}, "= shr_i16x2", R{x}, Shift{imm}); break;
269                 case Op::sra_i16x2: write(o, R{d}, "= sra_i16x2", R{x}, Shift{imm}); break;
270 
271                 case Op:: eq_i16x2: write(o, R{d}, "= eq_i16x2", R{x}, R{y}); break;
272                 case Op::neq_i16x2: write(o, R{d}, "= neq_i16x2", R{x}, R{y}); break;
273                 case Op:: lt_i16x2: write(o, R{d}, "= lt_i16x2", R{x}, R{y}); break;
274                 case Op::lte_i16x2: write(o, R{d}, "= lte_i16x2", R{x}, R{y}); break;
275                 case Op:: gt_i16x2: write(o, R{d}, "= gt_i16x2", R{x}, R{y}); break;
276                 case Op::gte_i16x2: write(o, R{d}, "= gte_i16x2", R{x}, R{y}); break;
277 
278 
279                 case Op::bit_and  : write(o, R{d}, "= bit_and"  , R{x}, R{y}      ); break;
280                 case Op::bit_or   : write(o, R{d}, "= bit_or"   , R{x}, R{y}      ); break;
281                 case Op::bit_xor  : write(o, R{d}, "= bit_xor"  , R{x}, R{y}      ); break;
282                 case Op::bit_clear: write(o, R{d}, "= bit_clear", R{x}, R{y}      ); break;
283                 case Op::select   : write(o, R{d}, "= select"   , R{x}, R{y}, R{z}); break;
284 
285                 case Op::bytes:   write(o, R{d}, "= bytes", R{x}, Hex{imm}); break;
286                 case Op::extract: write(o, R{d}, "= extract", R{x}, Shift{imm}, R{y}); break;
287                 case Op::pack:    write(o, R{d}, "= pack",    R{x}, R{y}, Shift{imm}); break;
288 
289                 case Op::to_f32: write(o, R{d}, "= to_f32", R{x}); break;
290                 case Op::to_i32: write(o, R{d}, "= to_i32", R{x}); break;
291             }
292             write(o, "\n");
293         }
294     }
295 
296     // Builder -> Program, with liveness and loop hoisting analysis.
297 
done(const char * debug_name)298     Program Builder::done(const char* debug_name) {
299         // Basic liveness analysis:
300         // an instruction is live until all live instructions that need its input have retired.
301         for (Val id = fProgram.size(); id --> 0; ) {
302             Instruction& inst = fProgram[id];
303             // All side-effect-only instructions (stores) are live.
304             if (inst.op <= Op::store32) {
305                 inst.death = id;
306             }
307             // The arguments of a live instruction must live until at least that instruction.
308             if (inst.death != 0) {
309                 // Notice how we're walking backward, storing the latest instruction in death.
310                 if (inst.x != NA && fProgram[inst.x].death == 0) { fProgram[inst.x].death = id; }
311                 if (inst.y != NA && fProgram[inst.y].death == 0) { fProgram[inst.y].death = id; }
312                 if (inst.z != NA && fProgram[inst.z].death == 0) { fProgram[inst.z].death = id; }
313             }
314         }
315 
316         // Mark which values don't depend on the loop and can be hoisted.
317         for (Val id = 0; id < (Val)fProgram.size(); id++) {
318             Builder::Instruction& inst = fProgram[id];
319 
320             // Varying loads (and gathers) and stores cannot be hoisted out of the loop.
321             if (inst.op <= Op::gather32) {
322                 inst.can_hoist = false;
323             }
324 
325             // If any of an instruction's inputs can't be hoisted, it can't be hoisted itself.
326             if (inst.can_hoist) {
327                 if (inst.x != NA) { inst.can_hoist &= fProgram[inst.x].can_hoist; }
328                 if (inst.y != NA) { inst.can_hoist &= fProgram[inst.y].can_hoist; }
329                 if (inst.z != NA) { inst.can_hoist &= fProgram[inst.z].can_hoist; }
330             }
331 
332             // We'll want to know if hoisted values are used in the loop;
333             // if not, we can recycle their registers like we do loop values.
334             if (!inst.can_hoist /*i.e. we're in the loop, so the arguments are used_in_loop*/) {
335                 if (inst.x != NA) { fProgram[inst.x].used_in_loop = true; }
336                 if (inst.y != NA) { fProgram[inst.y].used_in_loop = true; }
337                 if (inst.z != NA) { fProgram[inst.z].used_in_loop = true; }
338             }
339         }
340 
341         return {fProgram, fStrides, debug_name};
342     }
343 
344     // TODO: it's probably not important that we include post-Builder::done() fields like
345     // death, can_hoist, and used_in_loop in operator==() and InstructionHash::operator().
346     // They'll always have the same, initial values as set in Builder::push().
347 
operator ==(const Builder::Instruction & a,const Builder::Instruction & b)348     static bool operator==(const Builder::Instruction& a, const Builder::Instruction& b) {
349         return a.op           == b.op
350             && a.x            == b.x
351             && a.y            == b.y
352             && a.z            == b.z
353             && a.imm          == b.imm
354             && a.death        == b.death
355             && a.can_hoist    == b.can_hoist
356             && a.used_in_loop == b.used_in_loop;
357     }
358 
359     // TODO: replace with SkOpts::hash()?
operator ()(const Instruction & inst) const360     size_t Builder::InstructionHash::operator()(const Instruction& inst) const {
361         return Hash((uint8_t)inst.op)
362             ^ Hash(inst.x)
363             ^ Hash(inst.y)
364             ^ Hash(inst.z)
365             ^ Hash(inst.imm)
366             ^ Hash(inst.death)
367             ^ Hash(inst.can_hoist)
368             ^ Hash(inst.used_in_loop);
369     }
370 
371     // Most instructions produce a value and return it by ID,
372     // the value-producing instruction's own index in the program vector.
push(Op op,Val x,Val y,Val z,int imm)373     Val Builder::push(Op op, Val x, Val y, Val z, int imm) {
374         Instruction inst{op, x, y, z, imm,
375                          /*death=*/0, /*can_hoist=*/true, /*used_in_loop=*/false};
376 
377         // Basic common subexpression elimination:
378         // if we've already seen this exact Instruction, use it instead of creating a new one.
379         if (Val* id = fIndex.find(inst)) {
380             return *id;
381         }
382         Val id = static_cast<Val>(fProgram.size());
383         fProgram.push_back(inst);
384         fIndex.set(inst, id);
385         return id;
386     }
387 
isZero(Val id) const388     bool Builder::isZero(Val id) const {
389         return fProgram[id].op  == Op::splat
390             && fProgram[id].imm == 0;
391     }
392 
arg(int stride)393     Arg Builder::arg(int stride) {
394         int ix = (int)fStrides.size();
395         fStrides.push_back(stride);
396         return {ix};
397     }
398 
store8(Arg ptr,I32 val)399     void Builder::store8 (Arg ptr, I32 val) { (void)this->push(Op::store8 , val.id,NA,NA, ptr.ix); }
store16(Arg ptr,I32 val)400     void Builder::store16(Arg ptr, I32 val) { (void)this->push(Op::store16, val.id,NA,NA, ptr.ix); }
store32(Arg ptr,I32 val)401     void Builder::store32(Arg ptr, I32 val) { (void)this->push(Op::store32, val.id,NA,NA, ptr.ix); }
402 
load8(Arg ptr)403     I32 Builder::load8 (Arg ptr) { return {this->push(Op::load8 , NA,NA,NA, ptr.ix) }; }
load16(Arg ptr)404     I32 Builder::load16(Arg ptr) { return {this->push(Op::load16, NA,NA,NA, ptr.ix) }; }
load32(Arg ptr)405     I32 Builder::load32(Arg ptr) { return {this->push(Op::load32, NA,NA,NA, ptr.ix) }; }
406 
gather8(Arg ptr,I32 offset)407     I32 Builder::gather8 (Arg ptr, I32 offset) {
408         return {this->push(Op::gather8 , offset.id,NA,NA, ptr.ix)};
409     }
gather16(Arg ptr,I32 offset)410     I32 Builder::gather16(Arg ptr, I32 offset) {
411         return {this->push(Op::gather16, offset.id,NA,NA, ptr.ix)};
412     }
gather32(Arg ptr,I32 offset)413     I32 Builder::gather32(Arg ptr, I32 offset) {
414         return {this->push(Op::gather32, offset.id,NA,NA, ptr.ix)};
415     }
416 
uniform8(Arg ptr,int offset)417     I32 Builder::uniform8(Arg ptr, int offset) {
418         return {this->push(Op::uniform8, NA,NA,NA, ptr.ix | (offset<<16))};
419     }
uniform16(Arg ptr,int offset)420     I32 Builder::uniform16(Arg ptr, int offset) {
421         return {this->push(Op::uniform16, NA,NA,NA, ptr.ix | (offset<<16))};
422     }
uniform32(Arg ptr,int offset)423     I32 Builder::uniform32(Arg ptr, int offset) {
424         return {this->push(Op::uniform32, NA,NA,NA, ptr.ix | (offset<<16))};
425     }
426 
427     // The two splat() functions are just syntax sugar over splatting a 4-byte bit pattern.
splat(int n)428     I32 Builder::splat(int   n) { return {this->push(Op::splat, NA,NA,NA, n) }; }
splat(float f)429     F32 Builder::splat(float f) {
430         int bits;
431         memcpy(&bits, &f, 4);
432         return {this->push(Op::splat, NA,NA,NA, bits)};
433     }
434 
add(F32 x,F32 y)435     F32 Builder::add(F32 x, F32 y       ) { return {this->push(Op::add_f32, x.id, y.id)}; }
sub(F32 x,F32 y)436     F32 Builder::sub(F32 x, F32 y       ) { return {this->push(Op::sub_f32, x.id, y.id)}; }
mul(F32 x,F32 y)437     F32 Builder::mul(F32 x, F32 y       ) { return {this->push(Op::mul_f32, x.id, y.id)}; }
div(F32 x,F32 y)438     F32 Builder::div(F32 x, F32 y       ) { return {this->push(Op::div_f32, x.id, y.id)}; }
mad(F32 x,F32 y,F32 z)439     F32 Builder::mad(F32 x, F32 y, F32 z) {
440         if (this->isZero(z.id)) {
441             return this->mul(x,y);
442         }
443         return {this->push(Op::mad_f32, x.id, y.id, z.id)};
444     }
445 
add(I32 x,I32 y)446     I32 Builder::add(I32 x, I32 y) { return {this->push(Op::add_i32, x.id, y.id)}; }
sub(I32 x,I32 y)447     I32 Builder::sub(I32 x, I32 y) { return {this->push(Op::sub_i32, x.id, y.id)}; }
mul(I32 x,I32 y)448     I32 Builder::mul(I32 x, I32 y) { return {this->push(Op::mul_i32, x.id, y.id)}; }
449 
add_16x2(I32 x,I32 y)450     I32 Builder::add_16x2(I32 x, I32 y) { return {this->push(Op::add_i16x2, x.id, y.id)}; }
sub_16x2(I32 x,I32 y)451     I32 Builder::sub_16x2(I32 x, I32 y) { return {this->push(Op::sub_i16x2, x.id, y.id)}; }
mul_16x2(I32 x,I32 y)452     I32 Builder::mul_16x2(I32 x, I32 y) { return {this->push(Op::mul_i16x2, x.id, y.id)}; }
453 
shl(I32 x,int bits)454     I32 Builder::shl(I32 x, int bits) { return {this->push(Op::shl_i32, x.id,NA,NA, bits)}; }
shr(I32 x,int bits)455     I32 Builder::shr(I32 x, int bits) { return {this->push(Op::shr_i32, x.id,NA,NA, bits)}; }
sra(I32 x,int bits)456     I32 Builder::sra(I32 x, int bits) { return {this->push(Op::sra_i32, x.id,NA,NA, bits)}; }
457 
shl_16x2(I32 x,int bits)458     I32 Builder::shl_16x2(I32 x, int bits) { return {this->push(Op::shl_i16x2, x.id,NA,NA, bits)}; }
shr_16x2(I32 x,int bits)459     I32 Builder::shr_16x2(I32 x, int bits) { return {this->push(Op::shr_i16x2, x.id,NA,NA, bits)}; }
sra_16x2(I32 x,int bits)460     I32 Builder::sra_16x2(I32 x, int bits) { return {this->push(Op::sra_i16x2, x.id,NA,NA, bits)}; }
461 
eq(F32 x,F32 y)462     I32 Builder:: eq(F32 x, F32 y) { return {this->push(Op:: eq_f32, x.id, y.id)}; }
neq(F32 x,F32 y)463     I32 Builder::neq(F32 x, F32 y) { return {this->push(Op::neq_f32, x.id, y.id)}; }
lt(F32 x,F32 y)464     I32 Builder:: lt(F32 x, F32 y) { return {this->push(Op:: lt_f32, x.id, y.id)}; }
lte(F32 x,F32 y)465     I32 Builder::lte(F32 x, F32 y) { return {this->push(Op::lte_f32, x.id, y.id)}; }
gt(F32 x,F32 y)466     I32 Builder:: gt(F32 x, F32 y) { return {this->push(Op:: gt_f32, x.id, y.id)}; }
gte(F32 x,F32 y)467     I32 Builder::gte(F32 x, F32 y) { return {this->push(Op::gte_f32, x.id, y.id)}; }
468 
eq(I32 x,I32 y)469     I32 Builder:: eq(I32 x, I32 y) { return {this->push(Op:: eq_i32, x.id, y.id)}; }
neq(I32 x,I32 y)470     I32 Builder::neq(I32 x, I32 y) { return {this->push(Op::neq_i32, x.id, y.id)}; }
lt(I32 x,I32 y)471     I32 Builder:: lt(I32 x, I32 y) { return {this->push(Op:: lt_i32, x.id, y.id)}; }
lte(I32 x,I32 y)472     I32 Builder::lte(I32 x, I32 y) { return {this->push(Op::lte_i32, x.id, y.id)}; }
gt(I32 x,I32 y)473     I32 Builder:: gt(I32 x, I32 y) { return {this->push(Op:: gt_i32, x.id, y.id)}; }
gte(I32 x,I32 y)474     I32 Builder::gte(I32 x, I32 y) { return {this->push(Op::gte_i32, x.id, y.id)}; }
475 
eq_16x2(I32 x,I32 y)476     I32 Builder:: eq_16x2(I32 x, I32 y) { return {this->push(Op:: eq_i16x2, x.id, y.id)}; }
neq_16x2(I32 x,I32 y)477     I32 Builder::neq_16x2(I32 x, I32 y) { return {this->push(Op::neq_i16x2, x.id, y.id)}; }
lt_16x2(I32 x,I32 y)478     I32 Builder:: lt_16x2(I32 x, I32 y) { return {this->push(Op:: lt_i16x2, x.id, y.id)}; }
lte_16x2(I32 x,I32 y)479     I32 Builder::lte_16x2(I32 x, I32 y) { return {this->push(Op::lte_i16x2, x.id, y.id)}; }
gt_16x2(I32 x,I32 y)480     I32 Builder:: gt_16x2(I32 x, I32 y) { return {this->push(Op:: gt_i16x2, x.id, y.id)}; }
gte_16x2(I32 x,I32 y)481     I32 Builder::gte_16x2(I32 x, I32 y) { return {this->push(Op::gte_i16x2, x.id, y.id)}; }
482 
bit_and(I32 x,I32 y)483     I32 Builder::bit_and  (I32 x, I32 y) { return {this->push(Op::bit_and  , x.id, y.id)}; }
bit_or(I32 x,I32 y)484     I32 Builder::bit_or   (I32 x, I32 y) { return {this->push(Op::bit_or   , x.id, y.id)}; }
bit_xor(I32 x,I32 y)485     I32 Builder::bit_xor  (I32 x, I32 y) { return {this->push(Op::bit_xor  , x.id, y.id)}; }
bit_clear(I32 x,I32 y)486     I32 Builder::bit_clear(I32 x, I32 y) { return {this->push(Op::bit_clear, x.id, y.id)}; }
select(I32 x,I32 y,I32 z)487     I32 Builder::select(I32 x, I32 y, I32 z) { return {this->push(Op::select, x.id, y.id, z.id)}; }
488 
489 
extract(I32 x,int bits,I32 y)490     I32 Builder::extract(I32 x, int bits, I32 y) {
491         return {this->push(Op::extract, x.id,y.id,NA, bits)};
492     }
493 
pack(I32 x,I32 y,int bits)494     I32 Builder::pack(I32 x, I32 y, int bits) {
495         return {this->push(Op::pack, x.id,y.id,NA, bits)};
496     }
497 
bytes(I32 x,int control)498     I32 Builder::bytes(I32 x, int control) {
499         return {this->push(Op::bytes, x.id,NA,NA, control)};
500     }
501 
to_f32(I32 x)502     F32 Builder::to_f32(I32 x) { return {this->push(Op::to_f32, x.id)}; }
to_i32(F32 x)503     I32 Builder::to_i32(F32 x) { return {this->push(Op::to_i32, x.id)}; }
504 
505     // ~~~~ Program::eval() and co. ~~~~ //
506 
507     // Handy references for x86-64 instruction encoding:
508     // https://wiki.osdev.org/X86-64_Instruction_Encoding
509     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x64.htm
510     // https://www-user.tu-chemnitz.de/~heha/viewchm.php/hs/x86.chm/x86.htm
511     // http://ref.x86asm.net/coder64.html
512 
513     // Used for ModRM / immediate instruction encoding.
_233(int a,int b,int c)514     static uint8_t _233(int a, int b, int c) {
515         return (a & 3) << 6
516              | (b & 7) << 3
517              | (c & 7) << 0;
518     }
519 
520     // ModRM byte encodes the arguments of an opcode.
521     enum class Mod { Indirect, OneByteImm, FourByteImm, Direct };
mod_rm(Mod mod,int reg,int rm)522     static uint8_t mod_rm(Mod mod, int reg, int rm) {
523         return _233((int)mod, reg, rm);
524     }
525 
mod(int imm)526     static Mod mod(int imm) {
527         if (imm == 0)               { return Mod::Indirect; }
528         if (SkTFitsIn<int8_t>(imm)) { return Mod::OneByteImm; }
529         return Mod::FourByteImm;
530     }
531 
imm_bytes(Mod mod)532     static int imm_bytes(Mod mod) {
533         switch (mod) {
534             case Mod::Indirect:    return 0;
535             case Mod::OneByteImm:  return 1;
536             case Mod::FourByteImm: return 4;
537             case Mod::Direct: SkUNREACHABLE;
538         }
539         SkUNREACHABLE;
540     }
541 
542 #if 0
    // SIB byte encodes a memory address, base + (index * scale).
    // (Currently compiled out by the surrounding #if 0.)
    enum class Scale { One, Two, Four, Eight };
    // Packs scale (2 bits), index (3 bits) and base (3 bits) into one byte via _233().
    static uint8_t sib(Scale scale, int index, int base) {
        return _233((int)scale, index, base);
    }
548 #endif
549 
550     // The REX prefix is used to extend most old 32-bit instructions to 64-bit.
rex(bool W,bool R,bool X,bool B)551     static uint8_t rex(bool W,   // If set, operation is 64-bit, otherwise default, usually 32-bit.
552                        bool R,   // Extra top bit to select ModRM reg, registers 8-15.
553                        bool X,   // Extra top bit for SIB index register.
554                        bool B) { // Extra top bit for SIB base or ModRM rm register.
555         return 0b01000000   // Fixed 0100 for top four bits.
556              | (W << 3)
557              | (R << 2)
558              | (X << 1)
559              | (B << 0);
560     }
561 
562 
    // The VEX prefix extends SSE operations to AVX.  Used generally, even with XMM.
    // Holds an encoded prefix as produced by vex(): `len` is how many entries of
    // `bytes` are meaningful (vex() sets it to 2 or 3).
    struct VEX {
        int     len;
        uint8_t bytes[3];
    };
568 
vex(bool WE,bool R,bool X,bool B,int map,int vvvv,bool L,int pp)569     static VEX vex(bool  WE,   // Like REX W for int operations, or opcode extension for float?
570                    bool   R,   // Same as REX R.  Pass high bit of dst register, dst>>3.
571                    bool   X,   // Same as REX X.
572                    bool   B,   // Same as REX B.  Pass y>>3 for 3-arg ops, x>>3 for 2-arg.
573                    int  map,   // SSE opcode map selector: 0x0f, 0x380f, 0x3a0f.
574                    int vvvv,   // 4-bit second operand register.  Pass our x for 3-arg ops.
575                    bool   L,   // Set for 256-bit ymm operations, off for 128-bit xmm.
576                    int   pp) { // SSE mandatory prefix: 0x66, 0xf3, 0xf2, else none.
577 
578         // Pack x86 opcode map selector to 5-bit VEX encoding.
579         map = [map]{
580             switch (map) {
581                 case   0x0f: return 0b00001;
582                 case 0x380f: return 0b00010;
583                 case 0x3a0f: return 0b00011;
584                 // Several more cases only used by XOP / TBM.
585             }
586             SkUNREACHABLE;
587         }();
588 
589         // Pack  mandatory SSE opcode prefix byte to 2-bit VEX encoding.
590         pp = [pp]{
591             switch (pp) {
592                 case 0x66: return 0b01;
593                 case 0xf3: return 0b10;
594                 case 0xf2: return 0b11;
595             }
596             return 0b00;
597         }();
598 
599         VEX vex = {0, {0,0,0}};
600         if (X == 0 && B == 0 && WE == 0 && map == 0b00001) {
601             // With these conditions met, we can optionally compress VEX to 2-byte.
602             vex.len = 2;
603             vex.bytes[0] = 0xc5;
604             vex.bytes[1] = (pp      &  3) << 0
605                          | (L       &  1) << 2
606                          | (~vvvv   & 15) << 3
607                          | (~(int)R &  1) << 7;
608         } else {
609             // We could use this 3-byte VEX prefix all the time if we like.
610             vex.len = 3;
611             vex.bytes[0] = 0xc4;
612             vex.bytes[1] = (map     & 31) << 0
613                          | (~(int)B &  1) << 5
614                          | (~(int)X &  1) << 6
615                          | (~(int)R &  1) << 7;
616             vex.bytes[2] = (pp    &  3) << 0
617                          | (L     &  1) << 2
618                          | (~vvvv & 15) << 3
619                          | (WE    &  1) << 7;
620         }
621         return vex;
622     }
623 
Assembler(void * buf)624     Assembler::Assembler(void* buf) : fCode((uint8_t*)buf), fCurr(fCode), fSize(0) {}
625 
size() const626     size_t Assembler::size() const { return fSize; }
627 
bytes(const void * p,int n)628     void Assembler::bytes(const void* p, int n) {
629         if (fCurr) {
630             memcpy(fCurr, p, n);
631             fCurr += n;
632         }
633         fSize += n;
634     }
635 
byte(uint8_t b)636     void Assembler::byte(uint8_t b) { this->bytes(&b, 1); }
word(uint32_t w)637     void Assembler::word(uint32_t w) { this->bytes(&w, 4); }
638 
align(int mod)639     void Assembler::align(int mod) {
640         while (this->size() % mod) {
641             this->byte(0x00);
642         }
643     }
644 
vzeroupper()645     void Assembler::vzeroupper() {
646         this->byte(0xc5);
647         this->byte(0xf8);
648         this->byte(0x77);
649     }
ret()650     void Assembler::ret() { this->byte(0xc3); }
651 
652     // Common instruction building for 64-bit opcodes with an immediate argument.
op(int opcode,int opcode_ext,GP64 dst,int imm)653     void Assembler::op(int opcode, int opcode_ext, GP64 dst, int imm) {
654         opcode |= 0b0000'0001;   // low bit set for 64-bit operands
655         opcode |= 0b1000'0000;   // top bit set for instructions with any immediate
656 
657         int imm_bytes = 4;
658         if (SkTFitsIn<int8_t>(imm)) {
659             imm_bytes = 1;
660             opcode |= 0b0000'0010;  // second bit set for 8-bit immediate, else 32-bit.
661         }
662 
663         this->byte(rex(1,0,0,dst>>3));
664         this->byte(opcode);
665         this->byte(mod_rm(Mod::Direct, opcode_ext, dst&7));
666         this->bytes(&imm, imm_bytes);
667     }
668 
add(GP64 dst,int imm)669     void Assembler::add(GP64 dst, int imm) { this->op(0,0b000, dst,imm); }
sub(GP64 dst,int imm)670     void Assembler::sub(GP64 dst, int imm) { this->op(0,0b101, dst,imm); }
cmp(GP64 reg,int imm)671     void Assembler::cmp(GP64 reg, int imm) { this->op(0,0b111, reg,imm); }
672 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,Ymm y,bool W)673     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Ymm y, bool W/*=false*/) {
674         VEX v = vex(W, dst>>3, 0, y>>3,
675                     map, x, 1/*ymm, not xmm*/, prefix);
676         this->bytes(v.bytes, v.len);
677         this->byte(opcode);
678         this->byte(mod_rm(Mod::Direct, dst&7, y&7));
679     }
680 
    // Three-operand ymm instructions.  Each one just supplies (prefix, map, opcode) to op(),
    // which emits a VEX prefix carrying x in vvvv, the opcode byte, and a direct ModRM with
    // dst in reg and y in rm.

    // 0x66 prefix; vpmulld comes from the 0x380f map, the other two from 0x0f.
    void Assembler::vpaddd (Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0xfe, dst,x,y); }
    void Assembler::vpsubd (Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0xfa, dst,x,y); }
    void Assembler::vpmulld(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x40, dst,x,y); }

    // 0x66 prefix, 0x0f map.
    void Assembler::vpsubw (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xf9, dst,x,y); }
    void Assembler::vpmullw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xd5, dst,x,y); }

    // 0x66 prefix, 0x0f map.
    void Assembler::vpand (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdb, dst,x,y); }
    void Assembler::vpor  (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xeb, dst,x,y); }
    void Assembler::vpxor (Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xef, dst,x,y); }
    void Assembler::vpandn(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0xdf, dst,x,y); }

    // No mandatory prefix (0), 0x0f map.
    void Assembler::vaddps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x58, dst,x,y); }
    void Assembler::vsubps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5c, dst,x,y); }
    void Assembler::vmulps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x59, dst,x,y); }
    void Assembler::vdivps(Ymm dst, Ymm x, Ymm y) { this->op(0,0x0f,0x5e, dst,x,y); }

    // 0x66 prefix, 0x380f map; the three variants differ only in the opcode byte.
    void Assembler::vfmadd132ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x98, dst,x,y); }
    void Assembler::vfmadd213ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xa8, dst,x,y); }
    void Assembler::vfmadd231ps(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0xb8, dst,x,y); }

    // 0x66 prefix; note the two use different opcode maps.
    void Assembler::vpackusdw(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x380f,0x2b, dst,x,y); }
    void Assembler::vpackuswb(Ymm dst, Ymm x, Ymm y) { this->op(0x66,  0x0f,0x67, dst,x,y); }

    // 0x66 prefix, 0x0f map.
    void Assembler::vpcmpeqd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x76, dst,x,y); }
    void Assembler::vpcmpgtd(Ymm dst, Ymm x, Ymm y) { this->op(0x66,0x0f,0x66, dst,x,y); }
707 
vpblendvb(Ymm dst,Ymm x,Ymm y,Ymm z)708     void Assembler::vpblendvb(Ymm dst, Ymm x, Ymm y, Ymm z) {
709         int prefix = 0x66,
710             map    = 0x3a0f,
711             opcode = 0x4c;
712         VEX v = vex(0, dst>>3, 0, y>>3,
713                     map, x, /*ymm?*/1, prefix);
714         this->bytes(v.bytes, v.len);
715         this->byte(opcode);
716         this->byte(mod_rm(Mod::Direct, dst&7, y&7));
717         this->byte(z << 4);
718     }
719 
720     // dst = x op /opcode_ext imm
op(int prefix,int map,int opcode,int opcode_ext,Ymm dst,Ymm x,int imm)721     void Assembler::op(int prefix, int map, int opcode, int opcode_ext, Ymm dst, Ymm x, int imm) {
722         // This is a little weird, but if we pass the opcode_ext as if it were the dst register,
723         // the dst register as if x, and the x register as if y, all the bits end up where we want.
724         this->op(prefix, map, opcode, (Ymm)opcode_ext,dst,x);
725         this->byte(imm);
726     }
727 
vpslld(Ymm dst,Ymm x,int imm)728     void Assembler::vpslld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,6, dst,x,imm); }
vpsrld(Ymm dst,Ymm x,int imm)729     void Assembler::vpsrld(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,2, dst,x,imm); }
vpsrad(Ymm dst,Ymm x,int imm)730     void Assembler::vpsrad(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x72,4, dst,x,imm); }
731 
vpsrlw(Ymm dst,Ymm x,int imm)732     void Assembler::vpsrlw(Ymm dst, Ymm x, int imm) { this->op(0x66,0x0f,0x71,2, dst,x,imm); }
733 
734 
vpermq(Ymm dst,Ymm x,int imm)735     void Assembler::vpermq(Ymm dst, Ymm x, int imm) {
736         // A bit unusual among the instructions we use, this is 64-bit operation, so we set W.
737         bool W = true;
738         this->op(0x66,0x3a0f,0x00, dst,x,W);
739         this->byte(imm);
740     }
741 
vmovdqa(Ymm dst,Ymm src)742     void Assembler::vmovdqa(Ymm dst, Ymm src) { this->op(0x66,0x0f,0x6f, dst,src); }
743 
vcvtdq2ps(Ymm dst,Ymm x)744     void Assembler::vcvtdq2ps (Ymm dst, Ymm x) { this->op(0,   0x0f,0x5b, dst,x); }
vcvttps2dq(Ymm dst,Ymm x)745     void Assembler::vcvttps2dq(Ymm dst, Ymm x) { this->op(0xf3,0x0f,0x5b, dst,x); }
746 
here()747     Assembler::Label Assembler::here() {
748         return { (int)this->size(), Label::None, {} };
749     }
750 
disp19(Label * l)751     int Assembler::disp19(Label* l) {
752         SkASSERT(l->kind == Label::None ||
753                  l->kind == Label::ARMDisp19);
754         l->kind = Label::ARMDisp19;
755         l->references.push_back(here().offset);
756         // ARM 19-bit instruction count, from the beginning of this instruction.
757         return (l->offset - here().offset) / 4;
758     }
759 
disp32(Label * l)760     int Assembler::disp32(Label* l) {
761         SkASSERT(l->kind == Label::None ||
762                  l->kind == Label::X86Disp32);
763         l->kind = Label::X86Disp32;
764         l->references.push_back(here().offset);
765         // x86 32-bit byte count, from the end of this instruction.
766         return l->offset - (here().offset + 4);
767     }
768 
op(int prefix,int map,int opcode,Ymm dst,Ymm x,Label * l)769     void Assembler::op(int prefix, int map, int opcode, Ymm dst, Ymm x, Label* l) {
770         // IP-relative addressing uses Mod::Indirect with the R/M encoded as-if rbp or r13.
771         const int rip = rbp;
772 
773         VEX v = vex(0, dst>>3, 0, rip>>3,
774                     map, x, /*ymm?*/1, prefix);
775         this->bytes(v.bytes, v.len);
776         this->byte(opcode);
777         this->byte(mod_rm(Mod::Indirect, dst&7, rip&7));
778         this->word(this->disp32(l));
779     }
780 
vpshufb(Ymm dst,Ymm x,Label * l)781     void Assembler::vpshufb(Ymm dst, Ymm x, Label* l) { this->op(0x66,0x380f,0x00, dst,x,l); }
782 
vbroadcastss(Ymm dst,Label * l)783     void Assembler::vbroadcastss(Ymm dst, Label* l) { this->op(0x66,0x380f,0x18, dst, (Ymm)0, l); }
vbroadcastss(Ymm dst,Xmm src)784     void Assembler::vbroadcastss(Ymm dst, Xmm src)  { this->op(0x66,0x380f,0x18, dst, (Ymm)src); }
vbroadcastss(Ymm dst,GP64 ptr,int off)785     void Assembler::vbroadcastss(Ymm dst, GP64 ptr, int off) {
786         int prefix = 0x66,
787                map = 0x380f,
788             opcode = 0x18;
789         VEX v = vex(0, dst>>3, 0, ptr>>3,
790                     map, 0, /*ymm?*/1, prefix);
791         this->bytes(v.bytes, v.len);
792         this->byte(opcode);
793 
794         this->byte(mod_rm(mod(off), dst&7, ptr&7));
795         this->bytes(&off, imm_bytes(mod(off)));
796     }
797 
jump(uint8_t condition,Label * l)798     void Assembler::jump(uint8_t condition, Label* l) {
799         // These conditional jumps can be either 2 bytes (short) or 6 bytes (near):
800         //    7?     one-byte-disp
801         //    0F 8? four-byte-disp
802         // We always use the near displacement to make updating labels simpler (no resizing).
803         this->byte(0x0f);
804         this->byte(condition);
805         this->word(this->disp32(l));
806     }
je(Label * l)807     void Assembler::je (Label* l) { this->jump(0x84, l); }
jne(Label * l)808     void Assembler::jne(Label* l) { this->jump(0x85, l); }
jl(Label * l)809     void Assembler::jl (Label* l) { this->jump(0x8c, l); }
810 
jmp(Label * l)811     void Assembler::jmp(Label* l) {
812         // Like above in jump(), we could use 8-bit displacement here, but always use 32-bit.
813         this->byte(0xe9);
814         this->word(this->disp32(l));
815     }
816 
load_store(int prefix,int map,int opcode,Ymm ymm,GP64 ptr)817     void Assembler::load_store(int prefix, int map, int opcode, Ymm ymm, GP64 ptr) {
818         VEX v = vex(0, ymm>>3, 0, ptr>>3,
819                     map, 0, /*ymm?*/1, prefix);
820         this->bytes(v.bytes, v.len);
821         this->byte(opcode);
822         this->byte(mod_rm(Mod::Indirect, ymm&7, ptr&7));
823     }
824 
vmovups(Ymm dst,GP64 src)825     void Assembler::vmovups  (Ymm dst, GP64 src) { this->load_store(0   ,  0x0f,0x10, dst,src); }
vpmovzxwd(Ymm dst,GP64 src)826     void Assembler::vpmovzxwd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x33, dst,src); }
vpmovzxbd(Ymm dst,GP64 src)827     void Assembler::vpmovzxbd(Ymm dst, GP64 src) { this->load_store(0x66,0x380f,0x31, dst,src); }
828 
vmovups(GP64 dst,Ymm src)829     void Assembler::vmovups  (GP64 dst, Ymm src) { this->load_store(0   ,  0x0f,0x11, src,dst); }
vmovups(GP64 dst,Xmm src)830     void Assembler::vmovups  (GP64 dst, Xmm src) {
831         // Same as vmovups(GP64,YMM) and load_store() except ymm? is 0.
832         int prefix = 0,
833             map    = 0x0f,
834             opcode = 0x11;
835         VEX v = vex(0, src>>3, 0, dst>>3,
836                     map, 0, /*ymm?*/0, prefix);
837         this->bytes(v.bytes, v.len);
838         this->byte(opcode);
839         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
840     }
841 
vmovq(GP64 dst,Xmm src)842     void Assembler::vmovq(GP64 dst, Xmm src) {
843         int prefix = 0x66,
844             map    = 0x0f,
845             opcode = 0xd6;
846         VEX v = vex(0, src>>3, 0, dst>>3,
847                     map, 0, /*ymm?*/0, prefix);
848         this->bytes(v.bytes, v.len);
849         this->byte(opcode);
850         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
851     }
852 
vmovd(GP64 dst,Xmm src)853     void Assembler::vmovd(GP64 dst, Xmm src) {
854         int prefix = 0x66,
855             map    = 0x0f,
856             opcode = 0x7e;
857         VEX v = vex(0, src>>3, 0, dst>>3,
858                     map, 0, /*ymm?*/0, prefix);
859         this->bytes(v.bytes, v.len);
860         this->byte(opcode);
861         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
862     }
863 
vmovd_direct(GP64 dst,Xmm src)864     void Assembler::vmovd_direct(GP64 dst, Xmm src) {
865         int prefix = 0x66,
866             map    = 0x0f,
867             opcode = 0x7e;
868         VEX v = vex(0, src>>3, 0, dst>>3,
869                     map, 0, /*ymm?*/0, prefix);
870         this->bytes(v.bytes, v.len);
871         this->byte(opcode);
872         this->byte(mod_rm(Mod::Direct, src&7, dst&7));
873     }
874 
vmovd(Xmm dst,GP64 src)875     void Assembler::vmovd(Xmm dst, GP64 src) {
876         int prefix = 0x66,
877             map    = 0x0f,
878             opcode = 0x6e;
879         VEX v = vex(0, dst>>3, 0, src>>3,
880                     map, 0, /*ymm?*/0, prefix);
881         this->bytes(v.bytes, v.len);
882         this->byte(opcode);
883         this->byte(mod_rm(Mod::Indirect, dst&7, src&7));
884     }
885 
vmovd_direct(Xmm dst,GP64 src)886     void Assembler::vmovd_direct(Xmm dst, GP64 src) {
887         int prefix = 0x66,
888             map    = 0x0f,
889             opcode = 0x6e;
890         VEX v = vex(0, dst>>3, 0, src>>3,
891                     map, 0, /*ymm?*/0, prefix);
892         this->bytes(v.bytes, v.len);
893         this->byte(opcode);
894         this->byte(mod_rm(Mod::Direct, dst&7, src&7));
895     }
896 
movzbl(GP64 dst,GP64 src,int off)897     void Assembler::movzbl(GP64 dst, GP64 src, int off) {
898         if ((dst>>3) || (src>>3)) {
899             this->byte(rex(0,dst>>3,0,src>>3));
900         }
901         this->byte(0x0f);
902         this->byte(0xb6);
903         this->byte(mod_rm(mod(off), dst&7, src&7));
904         this->bytes(&off, imm_bytes(mod(off)));
905     }
906 
907 
movb(GP64 dst,GP64 src)908     void Assembler::movb(GP64 dst, GP64 src) {
909         if ((dst>>3) || (src>>3)) {
910             this->byte(rex(0,src>>3,0,dst>>3));
911         }
912         this->byte(0x88);
913         this->byte(mod_rm(Mod::Indirect, src&7, dst&7));
914     }
915 
vpinsrw(Xmm dst,Xmm src,GP64 ptr,int imm)916     void Assembler::vpinsrw(Xmm dst, Xmm src, GP64 ptr, int imm) {
917         int prefix = 0x66,
918             map    = 0x0f,
919             opcode = 0xc4;
920         VEX v = vex(0, dst>>3, 0, ptr>>3,
921                     map, src, /*ymm?*/0, prefix);
922         this->bytes(v.bytes, v.len);
923         this->byte(opcode);
924         this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
925         this->byte(imm);
926     }
927 
vpinsrb(Xmm dst,Xmm src,GP64 ptr,int imm)928     void Assembler::vpinsrb(Xmm dst, Xmm src, GP64 ptr, int imm) {
929         int prefix = 0x66,
930             map    = 0x3a0f,
931             opcode = 0x20;
932         VEX v = vex(0, dst>>3, 0, ptr>>3,
933                     map, src, /*ymm?*/0, prefix);
934         this->bytes(v.bytes, v.len);
935         this->byte(opcode);
936         this->byte(mod_rm(Mod::Indirect, dst&7, ptr&7));
937         this->byte(imm);
938     }
939 
vpextrw(GP64 ptr,Xmm src,int imm)940     void Assembler::vpextrw(GP64 ptr, Xmm src, int imm) {
941         int prefix = 0x66,
942             map    = 0x3a0f,
943             opcode = 0x15;
944 
945         VEX v = vex(0, src>>3, 0, ptr>>3,
946                     map, 0, /*ymm?*/0, prefix);
947         this->bytes(v.bytes, v.len);
948         this->byte(opcode);
949         this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
950         this->byte(imm);
951     }
vpextrb(GP64 ptr,Xmm src,int imm)952     void Assembler::vpextrb(GP64 ptr, Xmm src, int imm) {
953         int prefix = 0x66,
954             map    = 0x3a0f,
955             opcode = 0x14;
956 
957         VEX v = vex(0, src>>3, 0, ptr>>3,
958                     map, 0, /*ymm?*/0, prefix);
959         this->bytes(v.bytes, v.len);
960         this->byte(opcode);
961         this->byte(mod_rm(Mod::Indirect, src&7, ptr&7));
962         this->byte(imm);
963     }
964 
965     // https://static.docs.arm.com/ddi0596/a/DDI_0596_ARM_a64_instruction_set_architecture.pdf
966 
operator ""_mask(unsigned long long bits)967     static int operator"" _mask(unsigned long long bits) { return (1<<(int)bits)-1; }
968 
op(uint32_t hi,V m,uint32_t lo,V n,V d)969     void Assembler::op(uint32_t hi, V m, uint32_t lo, V n, V d) {
970         this->word( (hi & 11_mask) << 21
971                   | (m  &  5_mask) << 16
972                   | (lo &  6_mask) << 10
973                   | (n  &  5_mask) <<  5
974                   | (d  &  5_mask) <<  0);
975     }
976 
    // Three-register vector instructions.  Each supplies an 11-bit high field and a 6-bit low
    // field to op(hi, m, lo, n, d), which packs them around the m, n and d register numbers.
    // Within a group the entries differ only in a bit or two of those fields.

    // All share the low field 0b00011'1.
    void Assembler::and16b(V d, V n, V m) { this->op(0b0'1'0'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::orr16b(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b00011'1, n, d); }
    void Assembler::eor16b(V d, V n, V m) { this->op(0b0'1'1'01110'00'1, m, 0b00011'1, n, d); }
    void Assembler::bic16b(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b00011'1, n, d); }
    void Assembler::bsl16b(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b00011'1, n, d); }

    // add4s and sub4s differ only in the third bit of the high field.
    void Assembler::add4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::sub4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10000'1, n, d); }
    void Assembler::mul4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b10011'1, n, d); }

    void Assembler::cmeq4s(V d, V n, V m) { this->op(0b0'1'1'01110'10'1, m, 0b10001'1, n, d); }
    void Assembler::cmgt4s(V d, V n, V m) { this->op(0b0'1'0'01110'10'1, m, 0b0011'0'1, n, d); }

    // Same low fields as sub4s/mul4s above, with 01 instead of 10 in the high field.
    void Assembler::sub8h(V d, V n, V m) { this->op(0b0'1'1'01110'01'1, m, 0b10000'1, n, d); }
    void Assembler::mul8h(V d, V n, V m) { this->op(0b0'1'0'01110'01'1, m, 0b10011'1, n, d); }

    void Assembler::fadd4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11010'1, n, d); }
    void Assembler::fsub4s(V d, V n, V m) { this->op(0b0'1'0'01110'1'0'1, m, 0b11010'1, n, d); }
    void Assembler::fmul4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11011'1, n, d); }
    void Assembler::fdiv4s(V d, V n, V m) { this->op(0b0'1'1'01110'0'0'1, m, 0b11111'1, n, d); }

    void Assembler::fmla4s(V d, V n, V m) { this->op(0b0'1'0'01110'0'0'1, m, 0b11001'1, n, d); }

    void Assembler::tbl(V d, V n, V m) { this->op(0b0'1'001110'00'0, m, 0b0'00'0'00, n, d); }
1001 
op(uint32_t op22,int imm,V n,V d)1002     void Assembler::op(uint32_t op22, int imm, V n, V d) {
1003         this->word( (op22 & 22_mask) << 10
1004                   | imm              << 16   // imm is embedded inside op, bit size depends on op
1005                   | (n    &  5_mask) <<  5
1006                   | (d    &  5_mask) <<  0);
1007     }
1008 
sli4s(V d,V n,int imm)1009     void Assembler::sli4s(V d, V n, int imm) {
1010         this->op(0b0'1'1'011110'0100'000'01010'1,    ( imm&31), n, d);
1011     }
shl4s(V d,V n,int imm)1012     void Assembler::shl4s(V d, V n, int imm) {
1013         this->op(0b0'1'0'011110'0100'000'01010'1,    ( imm&31), n, d);
1014     }
sshr4s(V d,V n,int imm)1015     void Assembler::sshr4s(V d, V n, int imm) {
1016         this->op(0b0'1'0'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
1017     }
ushr4s(V d,V n,int imm)1018     void Assembler::ushr4s(V d, V n, int imm) {
1019         this->op(0b0'1'1'011110'0100'000'00'0'0'0'1, (-imm&31), n, d);
1020     }
ushr8h(V d,V n,int imm)1021     void Assembler::ushr8h(V d, V n, int imm) {
1022         this->op(0b0'1'1'011110'0010'000'00'0'0'0'1, (-imm&15), n, d);
1023     }
1024 
scvtf4s(V d,V n)1025     void Assembler::scvtf4s (V d, V n) { this->op(0b0'1'0'01110'0'0'10000'11101'10, n,d); }
fcvtzs4s(V d,V n)1026     void Assembler::fcvtzs4s(V d, V n) { this->op(0b0'1'0'01110'1'0'10000'1101'1'10, n,d); }
1027 
xtns2h(V d,V n)1028     void Assembler::xtns2h(V d, V n) { this->op(0b0'0'0'01110'01'10000'10010'10, n,d); }
xtnh2b(V d,V n)1029     void Assembler::xtnh2b(V d, V n) { this->op(0b0'0'0'01110'00'10000'10010'10, n,d); }
1030 
uxtlb2h(V d,V n)1031     void Assembler::uxtlb2h(V d, V n) { this->op(0b0'0'1'011110'0001'000'10100'1, n,d); }
uxtlh2s(V d,V n)1032     void Assembler::uxtlh2s(V d, V n) { this->op(0b0'0'1'011110'0010'000'10100'1, n,d); }
1033 
ret(X n)1034     void Assembler::ret(X n) {
1035         this->word(0b1101011'0'0'10'11111'0000'0'0 << 10
1036                   | (n & 5_mask) << 5);
1037     }
1038 
add(X d,X n,int imm12)1039     void Assembler::add(X d, X n, int imm12) {
1040         this->word(0b1'0'0'10001'00   << 22
1041                   | (imm12 & 12_mask) << 10
1042                   | (n     &  5_mask) <<  5
1043                   | (d     &  5_mask) <<  0);
1044     }
sub(X d,X n,int imm12)1045     void Assembler::sub(X d, X n, int imm12) {
1046         this->word( 0b1'1'0'10001'00  << 22
1047                   | (imm12 & 12_mask) << 10
1048                   | (n     &  5_mask) <<  5
1049                   | (d     &  5_mask) <<  0);
1050     }
subs(X d,X n,int imm12)1051     void Assembler::subs(X d, X n, int imm12) {
1052         this->word( 0b1'1'1'10001'00  << 22
1053                   | (imm12 & 12_mask) << 10
1054                   | (n     &  5_mask) <<  5
1055                   | (d     &  5_mask) <<  0);
1056     }
1057 
b(Condition cond,Label * l)1058     void Assembler::b(Condition cond, Label* l) {
1059         const int imm19 = this->disp19(l);
1060         this->word( 0b0101010'0           << 24
1061                   | (imm19     & 19_mask) <<  5
1062                   | ((int)cond &  4_mask) <<  0);
1063     }
cbz(X t,Label * l)1064     void Assembler::cbz(X t, Label* l) {
1065         const int imm19 = this->disp19(l);
1066         this->word( 0b1'011010'0      << 24
1067                   | (imm19 & 19_mask) <<  5
1068                   | (t     &  5_mask) <<  0);
1069     }
cbnz(X t,Label * l)1070     void Assembler::cbnz(X t, Label* l) {
1071         const int imm19 = this->disp19(l);
1072         this->word( 0b1'011010'1      << 24
1073                   | (imm19 & 19_mask) <<  5
1074                   | (t     &  5_mask) <<  0);
1075     }
1076 
    // Loads and stores between a vector register and memory addressed by an X register.
    // Each passes a 22-bit opcode whose low 12 bits are zero, then the address register,
    // then the vector register.
    // NOTE(review): the op() overload taking (opcode, X, V) is declared outside this chunk.

    // Loads: the address register (src) is passed first, the destination vector second.
    void Assembler::ldrq(V dst, X src) { this->op(0b00'111'1'01'11'000000000000, src, dst); }
    void Assembler::ldrs(V dst, X src) { this->op(0b10'111'1'01'01'000000000000, src, dst); }
    void Assembler::ldrb(V dst, X src) { this->op(0b00'111'1'01'01'000000000000, src, dst); }

    // Stores: same operand order (address register first), and each opcode matches the
    // corresponding load except for one bit of the 2-bit field before the zero bits.
    void Assembler::strq(V src, X dst) { this->op(0b00'111'1'01'10'000000000000, dst, src); }
    void Assembler::strs(V src, X dst) { this->op(0b10'111'1'01'00'000000000000, dst, src); }
    void Assembler::strb(V src, X dst) { this->op(0b00'111'1'01'00'000000000000, dst, src); }
1084 
ldrq(V dst,Label * l)1085     void Assembler::ldrq(V dst, Label* l) {
1086         const int imm19 = this->disp19(l);
1087         this->word( 0b10'011'1'00     << 24
1088                   | (imm19 & 19_mask) << 5
1089                   | (dst   &  5_mask) << 0);
1090     }
1091 
label(Label * l)1092     void Assembler::label(Label* l) {
1093         if (fCode) {
1094             // The instructions all currently point to l->offset.
1095             // We'll want to add a delta to point them to here().
1096             int delta = here().offset - l->offset;
1097             l->offset = here().offset;
1098 
1099             if (l->kind == Label::ARMDisp19) {
1100                 for (int ref : l->references) {
1101                     // ref points to a 32-bit instruction with 19-bit displacement in instructions.
1102                     uint32_t inst;
1103                     memcpy(&inst, fCode + ref, 4);
1104 
1105                     // [ 8 bits to preserve] [ 19 bit signed displacement ] [ 5 bits to preserve ]
1106                     int disp = (int)(inst << 8) >> 13;
1107 
1108                     disp += delta/4;  // delta is in bytes, we want instructions.
1109 
1110                     // Put it all back together, preserving the high 8 bits and low 5.
1111                     inst = ((disp << 5) &  (19_mask << 5))
1112                          | ((inst     ) & ~(19_mask << 5));
1113 
1114                     memcpy(fCode + ref, &inst, 4);
1115                 }
1116             }
1117 
1118             if (l->kind == Label::X86Disp32) {
1119                 for (int ref : l->references) {
1120                     // ref points to a 32-bit displacement in bytes.
1121                     int disp;
1122                     memcpy(&disp, fCode + ref, 4);
1123 
1124                     disp += delta;
1125 
1126                     memcpy(fCode + ref, &disp, 4);
1127                 }
1128             }
1129         }
1130     }
1131 
eval(int n,void * args[]) const1132     void Program::eval(int n, void* args[]) const {
1133         const int nargs = (int)fStrides.size();
1134 
1135         if (fJITBuf) {
1136             void** a = args;
1137             const void* b = fJITBuf;
1138             switch (nargs) {
1139                 case 0: return ((void(*)(int                        ))b)(n                    );
1140                 case 1: return ((void(*)(int,void*                  ))b)(n,a[0]               );
1141                 case 2: return ((void(*)(int,void*,void*            ))b)(n,a[0],a[1]          );
1142                 case 3: return ((void(*)(int,void*,void*,void*      ))b)(n,a[0],a[1],a[2]     );
1143                 case 4: return ((void(*)(int,void*,void*,void*,void*))b)(n,a[0],a[1],a[2],a[3]);
1144                 default: SkUNREACHABLE;  // TODO
1145             }
1146         }
1147 
1148         // We'll operate in SIMT style, knocking off K-size chunks from n while possible.
1149         constexpr int K = 16;
1150         using I32 = skvx::Vec<K, int>;
1151         using F32 = skvx::Vec<K, float>;
1152         using U32 = skvx::Vec<K, uint32_t>;
1153         using U16 = skvx::Vec<K, uint16_t>;
1154         using  U8 = skvx::Vec<K, uint8_t>;
1155 
1156         using I16x2 = skvx::Vec<2*K,  int16_t>;
1157         using U16x2 = skvx::Vec<2*K, uint16_t>;
1158 
1159         union Slot {
1160             F32   f32;
1161             I32   i32;
1162             U32   u32;
1163             I16x2 i16x2;
1164             U16x2 u16x2;
1165         };
1166 
1167         Slot                     few_regs[16];
1168         std::unique_ptr<char[]> many_regs;
1169 
1170         Slot* regs = few_regs;
1171 
1172         if (fRegs > (int)SK_ARRAY_COUNT(few_regs)) {
1173             // Annoyingly we can't trust that malloc() or new will work with Slot because
1174             // the skvx::Vec types may have alignment greater than what they provide.
1175             // We'll overallocate one extra register so we can align manually.
1176             many_regs.reset(new char[ sizeof(Slot) * (fRegs + 1) ]);
1177 
1178             uintptr_t addr = (uintptr_t)many_regs.get();
1179             addr += alignof(Slot) -
1180                      (addr & (alignof(Slot) - 1));
1181             SkASSERT((addr & (alignof(Slot) - 1)) == 0);
1182             regs = (Slot*)addr;
1183         }
1184 
1185 
1186         auto r = [&](Reg id) -> Slot& {
1187             SkASSERT(0 <= id && id < fRegs);
1188             return regs[id];
1189         };
1190         auto arg = [&](int ix) {
1191             SkASSERT(0 <= ix && ix < nargs);
1192             return args[ix];
1193         };
1194 
1195         // Step each argument pointer ahead by its stride a number of times.
1196         auto step_args = [&](int times) {
1197             for (int i = 0; i < (int)fStrides.size(); i++) {
1198                 args[i] = (void*)( (char*)args[i] + times * fStrides[i] );
1199             }
1200         };
1201 
1202         int start = 0,
1203             stride;
1204         for ( ; n > 0; start = fLoop, n -= stride, step_args(stride)) {
1205             stride = n >= K ? K : 1;
1206 
1207             for (int i = start; i < (int)fInstructions.size(); i++) {
1208                 Instruction inst = fInstructions[i];
1209 
1210                 // d = op(x,y,z/imm)
1211                 Reg   d = inst.d,
1212                       x = inst.x,
1213                       y = inst.y,
1214                       z = inst.z;
1215                 int imm = inst.imm;
1216 
1217                 // Ops that interact with memory need to know whether we're stride=1 or K,
1218                 // but all non-memory ops can run the same code no matter the stride.
1219                 switch (2*(int)inst.op + (stride == K ? 1 : 0)) {
1220                     default: SkUNREACHABLE;
1221 
1222                 #define STRIDE_1(op) case 2*(int)op
1223                 #define STRIDE_K(op) case 2*(int)op + 1
1224                     STRIDE_1(Op::store8 ): memcpy(arg(imm), &r(x).i32, 1); break;
1225                     STRIDE_1(Op::store16): memcpy(arg(imm), &r(x).i32, 2); break;
1226                     STRIDE_1(Op::store32): memcpy(arg(imm), &r(x).i32, 4); break;
1227 
1228                     STRIDE_K(Op::store8 ): skvx::cast<uint8_t> (r(x).i32).store(arg(imm)); break;
1229                     STRIDE_K(Op::store16): skvx::cast<uint16_t>(r(x).i32).store(arg(imm)); break;
1230                     STRIDE_K(Op::store32):                     (r(x).i32).store(arg(imm)); break;
1231 
1232                     STRIDE_1(Op::load8 ): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 1); break;
1233                     STRIDE_1(Op::load16): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 2); break;
1234                     STRIDE_1(Op::load32): r(d).i32 = 0; memcpy(&r(d).i32, arg(imm), 4); break;
1235 
1236                     STRIDE_K(Op::load8 ): r(d).i32= skvx::cast<int>(U8 ::Load(arg(imm))); break;
1237                     STRIDE_K(Op::load16): r(d).i32= skvx::cast<int>(U16::Load(arg(imm))); break;
1238                     STRIDE_K(Op::load32): r(d).i32=                 I32::Load(arg(imm)) ; break;
1239 
1240                     STRIDE_1(Op::gather8):
1241                         for (int i = 0; i < K; i++) {
1242                             r(d).i32[i] = (i == 0) ? ((const uint8_t* )arg(imm))[ r(x).i32[i] ] : 0;
1243                         } break;
1244                     STRIDE_1(Op::gather16):
1245                         for (int i = 0; i < K; i++) {
1246                             r(d).i32[i] = (i == 0) ? ((const uint16_t*)arg(imm))[ r(x).i32[i] ] : 0;
1247                         } break;
1248                     STRIDE_1(Op::gather32):
1249                         for (int i = 0; i < K; i++) {
1250                             r(d).i32[i] = (i == 0) ? ((const int*     )arg(imm))[ r(x).i32[i] ] : 0;
1251                         } break;
1252 
1253                     STRIDE_K(Op::gather8):
1254                         for (int i = 0; i < K; i++) {
1255                             r(d).i32[i] = ((const uint8_t* )arg(imm))[ r(x).i32[i] ];
1256                         } break;
1257                     STRIDE_K(Op::gather16):
1258                         for (int i = 0; i < K; i++) {
1259                             r(d).i32[i] = ((const uint16_t*)arg(imm))[ r(x).i32[i] ];
1260                         } break;
1261                     STRIDE_K(Op::gather32):
1262                         for (int i = 0; i < K; i++) {
1263                             r(d).i32[i] = ((const int*     )arg(imm))[ r(x).i32[i] ];
1264                         } break;
1265 
1266                 #undef STRIDE_1
1267                 #undef STRIDE_K
1268 
1269                     // Ops that don't interact with memory should never care about the stride.
1270                 #define CASE(op) case 2*(int)op: /*fallthrough*/ case 2*(int)op+1
1271 
1272                     CASE(Op::uniform8):
1273                         r(d).i32 = *(const uint8_t* )( (const char*)arg(imm&0xffff) + (imm>>16) );
1274                         break;
1275                     CASE(Op::uniform16):
1276                         r(d).i32 = *(const uint16_t*)( (const char*)arg(imm&0xffff) + (imm>>16) );
1277                         break;
1278                     CASE(Op::uniform32):
1279                         r(d).i32 = *(const int*     )( (const char*)arg(imm&0xffff) + (imm>>16) );
1280                         break;
1281 
1282                     CASE(Op::splat): r(d).i32 = imm; break;
1283 
1284                     CASE(Op::add_f32): r(d).f32 = r(x).f32 + r(y).f32; break;
1285                     CASE(Op::sub_f32): r(d).f32 = r(x).f32 - r(y).f32; break;
1286                     CASE(Op::mul_f32): r(d).f32 = r(x).f32 * r(y).f32; break;
1287                     CASE(Op::div_f32): r(d).f32 = r(x).f32 / r(y).f32; break;
1288 
1289                     CASE(Op::mad_f32): r(d).f32 = r(x).f32 * r(y).f32 + r(z).f32; break;
1290 
1291                     CASE(Op::add_i32): r(d).i32 = r(x).i32 + r(y).i32; break;
1292                     CASE(Op::sub_i32): r(d).i32 = r(x).i32 - r(y).i32; break;
1293                     CASE(Op::mul_i32): r(d).i32 = r(x).i32 * r(y).i32; break;
1294 
1295                     CASE(Op::add_i16x2): r(d).i16x2 = r(x).i16x2 + r(y).i16x2; break;
1296                     CASE(Op::sub_i16x2): r(d).i16x2 = r(x).i16x2 - r(y).i16x2; break;
1297                     CASE(Op::mul_i16x2): r(d).i16x2 = r(x).i16x2 * r(y).i16x2; break;
1298 
1299                     CASE(Op::shl_i32): r(d).i32 = r(x).i32 << imm; break;
1300                     CASE(Op::sra_i32): r(d).i32 = r(x).i32 >> imm; break;
1301                     CASE(Op::shr_i32): r(d).u32 = r(x).u32 >> imm; break;
1302 
1303                     CASE(Op::shl_i16x2): r(d).i16x2 = r(x).i16x2 << imm; break;
1304                     CASE(Op::sra_i16x2): r(d).i16x2 = r(x).i16x2 >> imm; break;
1305                     CASE(Op::shr_i16x2): r(d).u16x2 = r(x).u16x2 >> imm; break;
1306 
1307                     CASE(Op:: eq_f32): r(d).i32 = r(x).f32 == r(y).f32; break;
1308                     CASE(Op::neq_f32): r(d).i32 = r(x).f32 != r(y).f32; break;
1309                     CASE(Op:: lt_f32): r(d).i32 = r(x).f32 <  r(y).f32; break;
1310                     CASE(Op::lte_f32): r(d).i32 = r(x).f32 <= r(y).f32; break;
1311                     CASE(Op:: gt_f32): r(d).i32 = r(x).f32 >  r(y).f32; break;
1312                     CASE(Op::gte_f32): r(d).i32 = r(x).f32 >= r(y).f32; break;
1313 
1314                     CASE(Op:: eq_i32): r(d).i32 = r(x).i32 == r(y).i32; break;
1315                     CASE(Op::neq_i32): r(d).i32 = r(x).i32 != r(y).i32; break;
1316                     CASE(Op:: lt_i32): r(d).i32 = r(x).i32 <  r(y).i32; break;
1317                     CASE(Op::lte_i32): r(d).i32 = r(x).i32 <= r(y).i32; break;
1318                     CASE(Op:: gt_i32): r(d).i32 = r(x).i32 >  r(y).i32; break;
1319                     CASE(Op::gte_i32): r(d).i32 = r(x).i32 >= r(y).i32; break;
1320 
1321                     CASE(Op:: eq_i16x2): r(d).i16x2 = r(x).i16x2 == r(y).i16x2; break;
1322                     CASE(Op::neq_i16x2): r(d).i16x2 = r(x).i16x2 != r(y).i16x2; break;
1323                     CASE(Op:: lt_i16x2): r(d).i16x2 = r(x).i16x2 <  r(y).i16x2; break;
1324                     CASE(Op::lte_i16x2): r(d).i16x2 = r(x).i16x2 <= r(y).i16x2; break;
1325                     CASE(Op:: gt_i16x2): r(d).i16x2 = r(x).i16x2 >  r(y).i16x2; break;
1326                     CASE(Op::gte_i16x2): r(d).i16x2 = r(x).i16x2 >= r(y).i16x2; break;
1327 
1328                     CASE(Op::bit_and  ): r(d).i32 = r(x).i32 &  r(y).i32; break;
1329                     CASE(Op::bit_or   ): r(d).i32 = r(x).i32 |  r(y).i32; break;
1330                     CASE(Op::bit_xor  ): r(d).i32 = r(x).i32 ^  r(y).i32; break;
1331                     CASE(Op::bit_clear): r(d).i32 = r(x).i32 & ~r(y).i32; break;
1332 
1333                     CASE(Op::select): r(d).i32 = skvx::if_then_else(r(x).i32, r(y).i32, r(z).i32);
1334                                       break;
1335 
1336 
1337                     CASE(Op::extract): r(d).u32 = (r(x).u32 >> imm) & r(y).u32; break;
1338                     CASE(Op::pack):    r(d).u32 = r(x).u32 | (r(y).u32 << imm); break;
1339 
1340                     CASE(Op::bytes): {
1341                         const U32 table[] = {
1342                             0,
1343                             (r(x).u32      ) & 0xff,
1344                             (r(x).u32 >>  8) & 0xff,
1345                             (r(x).u32 >> 16) & 0xff,
1346                             (r(x).u32 >> 24) & 0xff,
1347                         };
1348                         r(d).u32 = table[(imm >>  0) & 0xf] <<  0
1349                                  | table[(imm >>  4) & 0xf] <<  8
1350                                  | table[(imm >>  8) & 0xf] << 16
1351                                  | table[(imm >> 12) & 0xf] << 24;
1352                     } break;
1353 
1354                     CASE(Op::to_f32): r(d).f32 = skvx::cast<float>(r(x).i32); break;
1355                     CASE(Op::to_i32): r(d).i32 = skvx::cast<int>  (r(x).f32); break;
1356                 #undef CASE
1357                 }
1358             }
1359         }
1360     }
1361 
hasJIT() const1362     bool Program::hasJIT() const {
1363         return fJITBuf != nullptr;
1364     }
1365 
dropJIT()1366     void Program::dropJIT() {
1367     #if defined(SKVM_JIT)
1368         if (fJITBuf) {
1369             munmap(fJITBuf, fJITSize);
1370         }
1371     #else
1372         SkASSERT(!this->hasJIT());
1373     #endif
1374 
1375         fJITBuf   = nullptr;
1376         fJITSize  = 0;
1377     }
1378 
~Program()1379     Program::~Program() { this->dropJIT(); }
1380 
Program(Program && other)1381     Program::Program(Program&& other) {
1382         fInstructions    = std::move(other.fInstructions);
1383         fRegs            = other.fRegs;
1384         fLoop            = other.fLoop;
1385         fStrides         = std::move(other.fStrides);
1386         fOriginalProgram = std::move(other.fOriginalProgram);
1387 
1388         std::swap(fJITBuf  , other.fJITBuf);
1389         std::swap(fJITSize , other.fJITSize);
1390     }
1391 
operator =(Program && other)1392     Program& Program::operator=(Program&& other) {
1393         fInstructions    = std::move(other.fInstructions);
1394         fRegs            = other.fRegs;
1395         fLoop            = other.fLoop;
1396         fStrides         = std::move(other.fStrides);
1397         fOriginalProgram = std::move(other.fOriginalProgram);
1398 
1399         std::swap(fJITBuf  , other.fJITBuf);
1400         std::swap(fJITSize , other.fJITSize);
1401         return *this;
1402     }
1403 
Program()1404     Program::Program() {}
1405 
Program(const std::vector<Builder::Instruction> & instructions,const std::vector<int> & strides,const char * debug_name)1406     Program::Program(const std::vector<Builder::Instruction>& instructions,
1407                      const std::vector<int>& strides,
1408                      const char* debug_name)
1409         : fStrides(strides)
1410         , fOriginalProgram(instructions)
1411     {
1412         this->setupInterpreter(instructions);
1413     #if defined(SKVM_JIT)
1414         this->setupJIT(instructions, debug_name);
1415     #endif
1416     }
1417 
1418     // Translate Builder::Instructions to Program::Instructions used by the interpreter.
setupInterpreter(const std::vector<Builder::Instruction> & instructions)1419     void Program::setupInterpreter(const std::vector<Builder::Instruction>& instructions) {
1420         // Register each instruction is assigned to.
1421         std::vector<Reg> reg(instructions.size());
1422 
1423         // This next bit is a bit more complicated than strictly necessary;
1424         // we could just assign every live instruction to its own register.
1425         //
1426         // But recycling registers is fairly cheap, and good practice for the
1427         // JITs where minimizing register pressure really is important.
1428         //
1429         // Since we have effectively infinite registers, we hoist any value we can.
1430         // (The JIT may choose a more complex policy to reduce register pressure.)
1431         auto hoisted = [&](Val id) { return instructions[id].can_hoist; };
1432 
1433         fRegs = 0;
1434         int live_instructions = 0;
1435         std::vector<Reg> avail;
1436 
1437         // Assign this value to a register, recycling them where we can.
1438         auto assign_register = [&](Val id) {
1439             live_instructions++;
1440             const Builder::Instruction& inst = instructions[id];
1441 
1442             // If this is a real input and it's lifetime ends at this instruction,
1443             // we can recycle the register it's occupying.
1444             auto maybe_recycle_register = [&](Val input) {
1445                 if (input != NA
1446                         && instructions[input].death == id
1447                         && !(hoisted(input) && instructions[input].used_in_loop)) {
1448                     avail.push_back(reg[input]);
1449                 }
1450             };
1451 
1452             // Take care to not recycle the same register twice.
1453             if (true                                ) { maybe_recycle_register(inst.x); }
1454             if (inst.y != inst.x                    ) { maybe_recycle_register(inst.y); }
1455             if (inst.z != inst.x && inst.z != inst.y) { maybe_recycle_register(inst.z); }
1456 
1457             // Allocate a register if we have to, preferring to reuse anything available.
1458             if (avail.empty()) {
1459                 reg[id] = fRegs++;
1460             } else {
1461                 reg[id] = avail.back();
1462                 avail.pop_back();
1463             }
1464         };
1465 
1466         // Assign a register to each live hoisted instruction.
1467         for (Val id = 0; id < (Val)instructions.size(); id++) {
1468             if (instructions[id].death != 0 && hoisted(id)) {
1469                 assign_register(id);
1470             }
1471         }
1472 
1473         // Assign registers to each live loop instruction.
1474         for (Val id = 0; id < (Val)instructions.size(); id++) {
1475             if (instructions[id].death != 0 && !hoisted(id)) {
1476                 assign_register(id);
1477 
1478             }
1479         }
1480 
1481         // Translate Builder::Instructions to Program::Instructions by mapping values to
1482         // registers.  This will be two passes, first hoisted instructions, then inside the loop.
1483 
1484         // The loop begins at the fLoop'th Instruction.
1485         fLoop = 0;
1486         fInstructions.reserve(live_instructions);
1487 
1488         // Add a dummy mapping for the N/A sentinel Val to any arbitrary register
1489         // so lookups don't have to know which arguments are used by which Ops.
1490         auto lookup_register = [&](Val id) {
1491             return id == NA ? (Reg)0
1492                             : reg[id];
1493         };
1494 
1495         auto push_instruction = [&](Val id, const Builder::Instruction& inst) {
1496             Program::Instruction pinst{
1497                 inst.op,
1498                 lookup_register(id),
1499                 lookup_register(inst.x),
1500                 lookup_register(inst.y),
1501                {lookup_register(inst.z)},
1502             };
1503             if (inst.z == NA) { pinst.imm = inst.imm; }
1504             fInstructions.push_back(pinst);
1505         };
1506 
1507         for (Val id = 0; id < (Val)instructions.size(); id++) {
1508             const Builder::Instruction& inst = instructions[id];
1509             if (inst.death != 0 && hoisted(id)) {
1510                 push_instruction(id, inst);
1511                 fLoop++;
1512             }
1513         }
1514         for (Val id = 0; id < (Val)instructions.size(); id++) {
1515             const Builder::Instruction& inst = instructions[id];
1516             if (inst.death != 0 && !hoisted(id)) {
1517                 push_instruction(id, inst);
1518             }
1519         }
1520     }
1521 
1522 #if defined(SKVM_JIT)
1523 
1524     // Just so happens that we can translate the immediate control for our bytes() op
1525     // to a single 128-bit mask that can be consumed by both AVX2 vpshufb and NEON tbl!
bytes_control(int imm,int mask[4])1526     static void bytes_control(int imm, int mask[4]) {
1527         auto nibble_to_vpshufb = [](uint8_t n) -> uint8_t {
1528             // 0 -> 0xff,    Fill with zero
1529             // 1 -> 0x00,    Select byte 0
1530             // 2 -> 0x01,         "      1
1531             // 3 -> 0x02,         "      2
1532             // 4 -> 0x03,         "      3
1533             return n - 1;
1534         };
1535         uint8_t control[] = {
1536             nibble_to_vpshufb( (imm >>  0) & 0xf ),
1537             nibble_to_vpshufb( (imm >>  4) & 0xf ),
1538             nibble_to_vpshufb( (imm >>  8) & 0xf ),
1539             nibble_to_vpshufb( (imm >> 12) & 0xf ),
1540         };
1541         for (int i = 0; i < 4; i++) {
1542             mask[i] = (int)control[0] <<  0
1543                     | (int)control[1] <<  8
1544                     | (int)control[2] << 16
1545                     | (int)control[3] << 24;
1546 
1547             // Update each byte that refers to a byte index by 4 to
1548             // point into the next 32-bit lane, but leave any 0xff
1549             // that fills with zero alone.
1550             control[0] += control[0] == 0xff ? 0 : 4;
1551             control[1] += control[1] == 0xff ? 0 : 4;
1552             control[2] += control[2] == 0xff ? 0 : 4;
1553             control[3] += control[3] == 0xff ? 0 : 4;
1554         }
1555     }
1556 
jit(const std::vector<Builder::Instruction> & instructions,const bool try_hoisting,Assembler * a) const1557     bool Program::jit(const std::vector<Builder::Instruction>& instructions,
1558                       const bool try_hoisting,
1559                       Assembler* a) const {
1560         using A = Assembler;
1561 
1562         auto debug_dump = [&] {
1563         #if 0
1564             SkDebugfStream stream;
1565             this->dump(&stream);
1566             dump_builder_program(fOriginalProgram, &stream);
1567             return true;
1568         #else
1569             return false;
1570         #endif
1571         };
1572 
1573     #if defined(__x86_64__)
1574         if (!SkCpu::Supports(SkCpu::HSW)) {
1575             return false;
1576         }
1577         A::GP64 N     = A::rdi,
1578                 arg[] = { A::rsi, A::rdx, A::rcx, A::r8, A::r9 };
1579 
1580         // All 16 ymm registers are available to use.
1581         using Reg = A::Ymm;
1582         uint32_t avail = 0xffff;
1583 
1584     #elif defined(__aarch64__)
1585         A::X N     = A::x0,
1586              arg[] = { A::x1, A::x2, A::x3, A::x4, A::x5, A::x6, A::x7 };
1587 
1588         // We can use v0-v7 and v16-v31 freely; we'd need to preserve v8-v15.
1589         using Reg = A::V;
1590         uint32_t avail = 0xffff00ff;
1591     #endif
1592 
1593         if (SK_ARRAY_COUNT(arg) < fStrides.size()) {
1594             return false;
1595         }
1596 
1597         auto hoisted = [&](Val id) { return try_hoisting && instructions[id].can_hoist; };
1598 
1599         std::vector<Reg> r(instructions.size());
1600 
1601         struct LabelAndReg {
1602             A::Label label;
1603             Reg      reg;
1604         };
1605         SkTHashMap<int, LabelAndReg> splats,
1606                                      bytes_masks;
1607 
1608         auto warmup = [&](Val id) {
1609             const Builder::Instruction& inst = instructions[id];
1610             if (inst.death == 0) {
1611                 return true;
1612             }
1613 
1614             Op op = inst.op;
1615             int imm = inst.imm;
1616 
1617             switch (op) {
1618                 default: break;
1619 
1620                 case Op::splat: if (!splats.find(imm)) { splats.set(imm, {}); }
1621                                 break;
1622 
1623                 case Op::bytes: if (!bytes_masks.find(imm)) {
1624                                     bytes_masks.set(imm, {});
1625                                     if (try_hoisting) {
1626                                         // vpshufb can always work with the mask from memory,
1627                                         // but it helps to hoist the mask to a register for tbl.
1628                                     #if defined(__aarch64__)
1629                                         LabelAndReg* entry = bytes_masks.find(imm);
1630                                         if (int found = __builtin_ffs(avail)) {
1631                                             entry->reg = (Reg)(found-1);
1632                                             avail ^= 1 << entry->reg;
1633                                             a->ldrq(entry->reg, &entry->label);
1634                                         } else {
1635                                             return false;
1636                                         }
1637                                     #endif
1638                                     }
1639                                 }
1640                                 break;
1641             }
1642             return true;
1643         };
1644 
1645         auto emit = [&](Val id, bool scalar) {
1646             const Builder::Instruction& inst = instructions[id];
1647 
1648             // No need to emit dead code instructions that produce values that are never used.
1649             if (inst.death == 0) {
1650                 return true;
1651             }
1652 
1653             Op op = inst.op;
1654             Val x = inst.x,
1655                 y = inst.y,
1656                 z = inst.z;
1657             int imm = inst.imm;
1658 
1659             // Most (but not all) ops create an output value and need a register to hold it, dst.
1660             // We track each instruction's dst in r[] so we can thread it through as an input
1661             // to any future instructions needing that value.
1662             //
1663             // And some ops may need a temporary scratch register, tmp.  Some need both tmp and dst.
1664             //
1665             // tmp and dst are very similar and can and will often be assigned the same register,
            // but tmp may never alias any of the instruction's inputs, while dst may when this
1667             // instruction consumes that input, i.e. if the input reaches its end of life here.
1668             //
1669             // We'll assign both registers lazily to keep register pressure as low as possible.
1670             bool tmp_is_set = false,
1671                  dst_is_set = false;
1672             Reg tmp_reg = (Reg)0;  // This initial value won't matter... anything legal is fine.
1673 
1674             bool ok = true;   // Set to false if we need to assign a register and none's available.
1675 
1676             // First lock in how to choose tmp if we need to based on the registers
1677             // available before this instruction, not including any of its input registers.
1678             auto tmp = [&,avail/*important, closing over avail's current value*/]{
1679                 if (!tmp_is_set) {
1680                     tmp_is_set = true;
1681                     if (int found = __builtin_ffs(avail)) {
1682                         // This is a scratch register just for this op,
1683                         // so we leave it marked available for future ops.
1684                         tmp_reg = (Reg)(found - 1);
1685                     } else {
1686                         // We needed a tmp register but couldn't find one available. :'(
1687                         // This will cause emit() to return false, in turn causing jit() to fail.
1688                         if (debug_dump()) {
1689                             SkDebugf("\nCould not find a register to hold tmp\n");
1690                         }
1691                         ok = false;
1692                     }
1693                 }
1694                 return tmp_reg;
1695             };
1696 
1697             // Now make available any registers that are consumed by this instruction.
1698             // (The register pool we can pick dst from is >= the pool for tmp, adding any of these.)
1699             auto maybe_recycle_register = [&](Val input) {
1700                 if (input != NA
1701                         && instructions[input].death == id
1702                         && !(hoisted(input) && instructions[input].used_in_loop)) {
1703                     avail |= 1 << r[input];
1704                 }
1705             };
1706             maybe_recycle_register(x);
1707             maybe_recycle_register(y);
1708             maybe_recycle_register(z);
1709             // set_dst() and dst() will work read/write with this perhaps-just-updated avail.
1710 
1711             // Some ops may decide dst on their own to best fit the instruction (see Op::mad_f32).
1712             auto set_dst = [&](Reg reg){
1713                 SkASSERT(dst_is_set == false);
1714                 dst_is_set = true;
1715 
1716                 SkASSERT(avail & (1<<reg));
1717                 avail ^= 1<<reg;
1718 
1719                 r[id] = reg;
1720             };
1721 
1722             // Thanks to AVX and NEON's 3-argument instruction sets,
1723             // most ops can use any register as dst.
1724             auto dst = [&]{
1725                 if (!dst_is_set) {
1726                     if (int found = __builtin_ffs(avail)) {
1727                         set_dst((Reg)(found-1));
1728                     } else {
1729                         // Same deal as with tmp... all the registers are occupied.  Time to fail!
1730                         if (debug_dump()) {
1731                             SkDebugf("\nCould not find a register to hold value %d\n", id);
1732                         }
1733                         ok = false;
1734                     }
1735                 }
1736                 return r[id];
1737             };
1738 
1739             // Because we use the same logic to pick an arbitrary dst and to pick tmp,
1740             // and we know that tmp will never overlap any of the inputs, `dst() == tmp()`
1741             // is a simple idiom to check that the destination does not overlap any of the inputs.
1742             // Sometimes we can use this knowledge to do better instruction selection.
1743 
1744             // Ok!  Keep in mind that we haven't assigned tmp or dst yet,
1745             // just laid out hooks for how to do so if we need them, depending on the instruction.
1746             //
1747             // Now let's actually assemble the instruction!
1748             switch (op) {
1749                 default:
1750                     if (debug_dump()) {
1751                         SkDEBUGFAILF("\n%d not yet implemented\n", op);
1752                     }
1753                     return false;  // TODO: many new ops
1754 
1755             #if defined(__x86_64__)
1756                 case Op::store8: if (scalar) { a->vpextrb  (arg[imm], (A::Xmm)r[x], 0); }
1757                                  else        { a->vpackusdw(tmp(), r[x], r[x]);
1758                                                a->vpermq   (tmp(), tmp(), 0xd8);
1759                                                a->vpackuswb(tmp(), tmp(), tmp());
1760                                                a->vmovq    (arg[imm], (A::Xmm)tmp()); }
1761                                                break;
1762 
1763                 case Op::store16: if (scalar) { a->vpextrw  (arg[imm], (A::Xmm)r[x], 0); }
1764                                   else        { a->vpackusdw(tmp(), r[x], r[x]);
1765                                                 a->vpermq   (tmp(), tmp(), 0xd8);
1766                                                 a->vmovups  (arg[imm], (A::Xmm)tmp()); }
1767                                                 break;
1768 
1769                 case Op::store32: if (scalar) { a->vmovd  (arg[imm], (A::Xmm)r[x]); }
1770                                   else        { a->vmovups(arg[imm],         r[x]); }
1771                                                 break;
1772 
1773                 case Op::load8:  if (scalar) {
1774                                      a->vpxor  (dst(), dst(), dst());
1775                                      a->vpinsrb((A::Xmm)dst(), (A::Xmm)dst(), arg[imm], 0);
1776                                  } else {
1777                                      a->vpmovzxbd(dst(), arg[imm]);
1778                                  } break;
1779 
1780                 case Op::load16: if (scalar) {
1781                                      a->vpxor  (dst(), dst(), dst());
1782                                      a->vpinsrw((A::Xmm)dst(), (A::Xmm)dst(), arg[imm], 0);
1783                                  } else {
1784                                      a->vpmovzxwd(dst(), arg[imm]);
1785                                  } break;
1786 
1787                 case Op::load32: if (scalar) { a->vmovd  ((A::Xmm)dst(), arg[imm]); }
1788                                  else        { a->vmovups(        dst(), arg[imm]); }
1789                                  break;
1790 
1791                 case Op::uniform8: a->movzbl(A::rax, arg[imm&0xffff], imm>>16);
1792                                    a->vmovd_direct((A::Xmm)dst(), A::rax);
1793                                    a->vbroadcastss(dst(), (A::Xmm)dst());
1794                                    break;
1795 
1796                 case Op::uniform32: a->vbroadcastss(dst(), arg[imm&0xffff], imm>>16);
1797                                     break;
1798 
1799                 case Op::splat: a->vbroadcastss(dst(), &splats.find(imm)->label);
1800                                 break;
1801                                 // TODO: many of these instructions have variants that
                                // can read one of their arguments from 32-byte memory
1803                                 // instead of a register.  Find a way to avoid needing
1804                                 // to splat most* constants out at all?
1805                                 // (*Might work for x - 255 but not 255 - x, so will
1806                                 // always need to be able to splat to a register.)
1807 
1808                 case Op::add_f32: a->vaddps(dst(), r[x], r[y]); break;
1809                 case Op::sub_f32: a->vsubps(dst(), r[x], r[y]); break;
1810                 case Op::mul_f32: a->vmulps(dst(), r[x], r[y]); break;
1811                 case Op::div_f32: a->vdivps(dst(), r[x], r[y]); break;
1812 
1813                 case Op::mad_f32:
1814                     if      (avail & (1<<r[x])) { set_dst(r[x]); a->vfmadd132ps(r[x], r[z], r[y]); }
1815                     else if (avail & (1<<r[y])) { set_dst(r[y]); a->vfmadd213ps(r[y], r[x], r[z]); }
1816                     else if (avail & (1<<r[z])) { set_dst(r[z]); a->vfmadd231ps(r[z], r[x], r[y]); }
1817                     else                        {                SkASSERT(dst() == tmp());
1818                                                                  a->vmovdqa    (dst(),r[x]);
1819                                                                  a->vfmadd132ps(dst(),r[z], r[y]); }
1820                                                                  break;
1821 
1822                 case Op::add_i32: a->vpaddd (dst(), r[x], r[y]); break;
1823                 case Op::sub_i32: a->vpsubd (dst(), r[x], r[y]); break;
1824                 case Op::mul_i32: a->vpmulld(dst(), r[x], r[y]); break;
1825 
1826                 case Op::sub_i16x2: a->vpsubw (dst(), r[x], r[y]); break;
1827                 case Op::mul_i16x2: a->vpmullw(dst(), r[x], r[y]); break;
1828                 case Op::shr_i16x2: a->vpsrlw (dst(), r[x],  imm); break;
1829 
1830                 case Op::bit_and  : a->vpand (dst(), r[x], r[y]); break;
1831                 case Op::bit_or   : a->vpor  (dst(), r[x], r[y]); break;
1832                 case Op::bit_xor  : a->vpxor (dst(), r[x], r[y]); break;
1833                 case Op::bit_clear: a->vpandn(dst(), r[y], r[x]); break;  // N.B. Y then X.
1834                 case Op::select   : a->vpblendvb(dst(), r[z], r[y], r[x]); break;
1835 
1836                 case Op::shl_i32: a->vpslld(dst(), r[x], imm); break;
1837                 case Op::shr_i32: a->vpsrld(dst(), r[x], imm); break;
1838                 case Op::sra_i32: a->vpsrad(dst(), r[x], imm); break;
1839 
1840                 case Op::eq_i32: a->vpcmpeqd(dst(), r[x], r[y]); break;
1841                 case Op::lt_i32: a->vpcmpgtd(dst(), r[y], r[x]); break;
1842                 case Op::gt_i32: a->vpcmpgtd(dst(), r[x], r[y]); break;
1843 
1844                 case Op::extract: if (imm == 0) { a->vpand (dst(),  r[x], r[y]); }
1845                                   else          { a->vpsrld(tmp(),  r[x], imm);
1846                                                   a->vpand (dst(), tmp(), r[y]); }
1847                                   break;
1848 
1849                 case Op::pack: a->vpslld(tmp(),  r[y], imm);
1850                                a->vpor  (dst(), tmp(), r[x]);
1851                                break;
1852 
1853                 case Op::to_f32: a->vcvtdq2ps (dst(), r[x]); break;
1854                 case Op::to_i32: a->vcvttps2dq(dst(), r[x]); break;
1855 
1856                 case Op::bytes: a->vpshufb(dst(), r[x], &bytes_masks.find(imm)->label);
1857                                 break;
1858 
1859             #elif defined(__aarch64__)
1860                 case Op::store8: a->xtns2h(tmp(), r[x]);
1861                                  a->xtnh2b(tmp(), tmp());
1862                    if (scalar) { a->strb  (tmp(), arg[imm]); }
1863                    else        { a->strs  (tmp(), arg[imm]); }
1864                                  break;
1865                 // TODO: another case where it'd be okay to alias r[x] and tmp if r[x] dies here.
1866 
1867                 case Op::store32: if (scalar) { a->strs(r[x], arg[imm]); }
1868                                   else        { a->strq(r[x], arg[imm]); }
1869                                                 break;
1870 
1871                 case Op::load8: if (scalar) { a->ldrb(tmp(), arg[imm]); }
1872                                 else        { a->ldrs(tmp(), arg[imm]); }
1873                                               a->uxtlb2h(tmp(), tmp());
1874                                               a->uxtlh2s(dst(), tmp());
1875                                               break;
1876 
1877                 case Op::load32: if (scalar) { a->ldrs(dst(), arg[imm]); }
1878                                  else        { a->ldrq(dst(), arg[imm]); }
1879                                                break;
1880 
1881                 case Op::splat: a->ldrq(dst(), &splats.find(imm)->label);
1882                                 break;
1883                                 // TODO: If we hoist these, pack 4 values in each register
1884                                 // and use vector/lane operations, cutting the register
1885                                 // pressure cost of hoisting by 4?
1886 
1887                 case Op::add_f32: a->fadd4s(dst(), r[x], r[y]); break;
1888                 case Op::sub_f32: a->fsub4s(dst(), r[x], r[y]); break;
1889                 case Op::mul_f32: a->fmul4s(dst(), r[x], r[y]); break;
1890                 case Op::div_f32: a->fdiv4s(dst(), r[x], r[y]); break;
1891 
1892                 case Op::mad_f32: // fmla4s is z += x*y
1893                     if (avail & (1<<r[z])) { set_dst(r[z]); a->fmla4s( r[z],  r[x],  r[y]);   }
1894                     else {                                  a->orr16b(tmp(),  r[z],  r[z]);
1895                                                             a->fmla4s(tmp(),  r[x],  r[y]);
1896                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
1897                                                             break;
1898 
1899 
1900                 case Op::add_i32: a->add4s(dst(), r[x], r[y]); break;
1901                 case Op::sub_i32: a->sub4s(dst(), r[x], r[y]); break;
1902                 case Op::mul_i32: a->mul4s(dst(), r[x], r[y]); break;
1903 
1904                 case Op::sub_i16x2: a->sub8h (dst(), r[x], r[y]); break;
1905                 case Op::mul_i16x2: a->mul8h (dst(), r[x], r[y]); break;
1906                 case Op::shr_i16x2: a->ushr8h(dst(), r[x],  imm); break;
1907 
1908                 case Op::bit_and  : a->and16b(dst(), r[x], r[y]); break;
1909                 case Op::bit_or   : a->orr16b(dst(), r[x], r[y]); break;
1910                 case Op::bit_xor  : a->eor16b(dst(), r[x], r[y]); break;
1911                 case Op::bit_clear: a->bic16b(dst(), r[x], r[y]); break;
1912 
1913                 case Op::select: // bsl16b is x = x ? y : z
1914                     if (avail & (1<<r[x])) { set_dst(r[x]); a->bsl16b( r[x],  r[y],  r[z]); }
1915                     else {                                  a->orr16b(tmp(),  r[x],  r[x]);
1916                                                             a->bsl16b(tmp(),  r[y],  r[z]);
1917                                        if(dst() != tmp()) { a->orr16b(dst(), tmp(), tmp()); } }
1918                                                             break;
1919 
1920                 case Op::shl_i32: a-> shl4s(dst(), r[x], imm); break;
1921                 case Op::shr_i32: a->ushr4s(dst(), r[x], imm); break;
1922                 case Op::sra_i32: a->sshr4s(dst(), r[x], imm); break;
1923 
1924                 case Op::eq_i32: a->cmeq4s(dst(), r[x], r[y]); break;
1925                 case Op::lt_i32: a->cmgt4s(dst(), r[y], r[x]); break;
1926                 case Op::gt_i32: a->cmgt4s(dst(), r[x], r[y]); break;
1927 
1928                 case Op::extract: if (imm) { a->ushr4s(tmp(), r[x], imm);
1929                                              a->and16b(dst(), tmp(), r[y]); }
1930                                   else     { a->and16b(dst(), r[x], r[y]); }
1931                                              break;
1932 
1933                 case Op::pack:
1934                     if (avail & (1<<r[x])) { set_dst(r[x]); a->sli4s ( r[x],  r[y],  imm); }
1935                     else                   {                a->shl4s (tmp(),  r[y],  imm);
1936                                                             a->orr16b(dst(), tmp(), r[x]); }
1937                                                             break;
1938 
1939                 case Op::to_f32: a->scvtf4s (dst(), r[x]); break;
1940                 case Op::to_i32: a->fcvtzs4s(dst(), r[x]); break;
1941 
1942                 case Op::bytes:
1943                     if (try_hoisting) { a->tbl (dst(), r[x], bytes_masks.find(imm)->reg); }
1944                     else              { a->ldrq(tmp(), &bytes_masks.find(imm)->label);
1945                                         a->tbl (dst(), r[x], tmp()); }
1946                                         break;
1947             #endif
1948             }
1949 
1950             // Calls to tmp() or dst() might have flipped this false from its default true state.
1951             return ok;
1952         };
1953 
1954 
1955         #if defined(__x86_64__)
1956             const int K = 8;
1957             auto jump_if_less = [&](A::Label* l) { a->jl (l); };
1958             auto jump         = [&](A::Label* l) { a->jmp(l); };
1959 
1960             auto add = [&](A::GP64 gp, int imm) { a->add(gp, imm); };
1961             auto sub = [&](A::GP64 gp, int imm) { a->sub(gp, imm); };
1962 
1963             auto exit = [&]{ a->vzeroupper(); a->ret(); };
1964         #elif defined(__aarch64__)
1965             const int K = 4;
1966             auto jump_if_less = [&](A::Label* l) { a->blt(l); };
1967             auto jump         = [&](A::Label* l) { a->b  (l); };
1968 
1969             auto add = [&](A::X gp, int imm) { a->add(gp, gp, imm); };
1970             auto sub = [&](A::X gp, int imm) { a->sub(gp, gp, imm); };
1971 
1972             auto exit = [&]{ a->ret(A::x30); };
1973         #endif
1974 
1975         A::Label body,
1976                  tail,
1977                  done;
1978 
1979         for (Val id = 0; id < (Val)instructions.size(); id++) {
1980             if (!warmup(id)) {
1981                 return false;
1982             }
1983             if (hoisted(id) && !emit(id, /*scalar=*/false)) {
1984                 return false;
1985             }
1986         }
1987 
1988         a->label(&body);
1989         {
1990             a->cmp(N, K);
1991             jump_if_less(&tail);
1992             for (Val id = 0; id < (Val)instructions.size(); id++) {
1993                 if (!hoisted(id) && !emit(id, /*scalar=*/false)) {
1994                     return false;
1995                 }
1996             }
1997             for (int i = 0; i < (int)fStrides.size(); i++) {
1998                 if (fStrides[i]) {
1999                     add(arg[i], K*fStrides[i]);
2000                 }
2001             }
2002             sub(N, K);
2003             jump(&body);
2004         }
2005 
2006         a->label(&tail);
2007         {
2008             a->cmp(N, 1);
2009             jump_if_less(&done);
2010             for (Val id = 0; id < (Val)instructions.size(); id++) {
2011                 if (!hoisted(id) && !emit(id, /*scalar=*/true)) {
2012                     return false;
2013                 }
2014             }
2015             for (int i = 0; i < (int)fStrides.size(); i++) {
2016                 if (fStrides[i]) {
2017                     add(arg[i], 1*fStrides[i]);
2018                 }
2019             }
2020             sub(N, 1);
2021             jump(&tail);
2022         }
2023 
2024         a->label(&done);
2025         {
2026             exit();
2027         }
2028 
2029         bytes_masks.foreach([&](int imm, LabelAndReg* entry) {
2030             // One 16-byte pattern for ARM tbl, that same pattern twice for x86-64 vpshufb.
2031         #if defined(__x86_64__)
2032             a->align(32);
2033         #elif defined(__aarch64__)
2034             a->align(4);
2035         #endif
2036 
2037             a->label(&entry->label);
2038             int mask[4];
2039             bytes_control(imm, mask);
2040             a->bytes(mask, sizeof(mask));
2041         #if defined(__x86_64__)
2042             a->bytes(mask, sizeof(mask));
2043         #endif
2044         });
2045 
2046         splats.foreach([&](int imm, LabelAndReg* entry) {
2047             // vbroadcastss 4 bytes on x86-64, or simply load 16-bytes on aarch64.
2048             a->align(4);
2049             a->label(&entry->label);
2050             a->word(imm);
2051         #if defined(__aarch64__)
2052             a->word(imm);
2053             a->word(imm);
2054             a->word(imm);
2055         #endif
2056         });
2057 
2058         return true;
2059     }
2060 
setupJIT(const std::vector<Builder::Instruction> & instructions,const char * debug_name)2061     void Program::setupJIT(const std::vector<Builder::Instruction>& instructions,
2062                            const char* debug_name) {
2063         // Assemble with no buffer to determine a.size(), the number of bytes we'll assemble.
2064         Assembler a{nullptr};
2065 
2066         // First try allowing code hoisting (faster code)
2067         // then again without if that fails (lower register pressure).
2068         bool try_hoisting = true;
2069         if (!this->jit(instructions, try_hoisting, &a)) {
2070             try_hoisting = false;
2071             if (!this->jit(instructions, try_hoisting, &a)) {
2072                 return;
2073             }
2074         }
2075 
2076         // Allocate space that we can remap as executable.
2077         const size_t page = sysconf(_SC_PAGESIZE);
2078         fJITSize = ((a.size() + page - 1) / page) * page;  // mprotect works at page granularity.
2079         fJITBuf = mmap(nullptr,fJITSize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1,0);
2080 
2081         // Assemble the program for real.
2082         a = Assembler{fJITBuf};
2083         SkAssertResult(this->jit(instructions, try_hoisting, &a));
2084         SkASSERT(a.size() <= fJITSize);
2085 
2086         // Remap as executable, and flush caches on platforms that need that.
2087         mprotect(fJITBuf, fJITSize, PROT_READ|PROT_EXEC);
2088         __builtin___clear_cache((char*)fJITBuf,
2089                                 (char*)fJITBuf + fJITSize);
2090     #if defined(SKVM_PERF_DUMPS)
2091         this->dumpJIT(debug_name, a.size());
2092     #endif
2093     }
2094 #endif
2095 
2096 #if defined(SKVM_PERF_DUMPS)
dumpJIT(const char * debug_name,size_t size) const2097     void Program::dumpJIT(const char* debug_name, size_t size) const {
2098     #if 0 && defined(__aarch64__)
2099         if (debug_name) {
2100             SkDebugf("\n%s:", debug_name);
2101         }
2102         // cat | llvm-mc -arch aarch64 -disassemble
2103         auto cur = (const uint8_t*)fJITBuf;
2104         for (int i = 0; i < (int)size; i++) {
2105             if (i % 4 == 0) {
2106                 SkDebugf("\n");
2107             }
2108             SkDebugf("0x%02x ", *cur++);
2109         }
2110         SkDebugf("\n");
2111     #endif
2112 
2113         // We're doing some really stateful things below so one thread at a time please...
2114         static SkSpinlock dump_lock;
2115         SkAutoSpinlock lock(dump_lock);
2116 
2117         auto fnv1a = [](const void* vbuf, size_t n) {
2118             uint32_t hash = 2166136261;
2119             for (auto buf = (const uint8_t*)vbuf; n --> 0; buf++) {
2120                 hash ^= *buf;
2121                 hash *= 16777619;
2122             }
2123             return hash;
2124         };
2125 
2126 
2127         char name[64];
2128         uint32_t hash = fnv1a(fJITBuf, size);
2129         if (debug_name) {
2130             sprintf(name, "skvm-jit-%s", debug_name);
2131         } else {
2132             sprintf(name, "skvm-jit-%u", hash);
2133         }
2134 
2135         // Create a jit-<pid>.dump file that we can `perf inject -j` into a
2136         // perf.data captured with `perf record -k 1`, letting us see each
2137         // JIT'd Program as if a function named skvm-jit-<hash>.   E.g.
2138         //
2139         //   ninja -C out nanobench
2140         //   perf record -k 1 out/nanobench -m SkVM_4096_I32\$
2141         //   perf inject -j -i perf.data -o perf.data.jit
2142         //   perf report -i perf.data.jit
2143         //
2144         // Running `perf inject -j` will also dump an .so for each JIT'd
2145         // program, named jitted-<pid>-<hash>.so.
2146         //
2147         //    https://lwn.net/Articles/638566/
2148         //    https://v8.dev/docs/linux-perf
2149         //    https://cs.chromium.org/chromium/src/v8/src/diagnostics/perf-jit.cc
2150         //    https://lore.kernel.org/patchwork/patch/622240/
2151 
2152 
2153         auto timestamp_ns = []() -> uint64_t {
2154             // It's important to use CLOCK_MONOTONIC here so that perf can
2155             // correlate our timestamps with those captured by `perf record
2156             // -k 1`.  That's also what `-k 1` does, by the way, tell perf
2157             // record to use CLOCK_MONOTONIC.
2158             struct timespec ts;
2159             clock_gettime(CLOCK_MONOTONIC, &ts);
2160             return ts.tv_sec * (uint64_t)1e9 + ts.tv_nsec;
2161         };
2162 
2163         // We'll open the jit-<pid>.dump file and write a small header once,
2164         // and just leave it open forever because we're lazy.
2165         static FILE* jitdump = [&]{
2166             // Must map as w+ for the mmap() call below to work.
2167             char path[64];
2168             sprintf(path, "jit-%d.dump", getpid());
2169             FILE* f = fopen(path, "w+");
2170 
2171             // Calling mmap() on the file adds a "hey they mmap()'d this" record to
2172             // the perf.data file that will point `perf inject -j` at this log file.
2173             // Kind of a strange way to tell `perf inject` where the file is...
2174             void* marker = mmap(nullptr, sysconf(_SC_PAGESIZE),
2175                                 PROT_READ|PROT_EXEC, MAP_PRIVATE,
2176                                 fileno(f), /*offset=*/0);
2177             SkASSERT_RELEASE(marker != MAP_FAILED);
2178             // Like never calling fclose(f), we'll also just always leave marker mmap()'d.
2179 
2180         #if defined(__x86_64__)
2181             const uint32_t elf_mach = 62;
2182         #elif defined(__aarch64__)
2183             const uint32_t elf_mach = 183;
2184         #endif
2185 
2186             struct Header {
2187                 uint32_t magic, version, header_size, elf_mach, reserved, pid;
2188                 uint64_t timestamp_us, flags;
2189             } header = {
2190                 0x4A695444, 1, sizeof(Header), elf_mach, 0, (uint32_t)getpid(),
2191                 timestamp_ns() / 1000, 0,
2192             };
2193             fwrite(&header, sizeof(header), 1, f);
2194 
2195             return f;
2196         }();
2197 
2198         struct CodeLoad {
2199             uint32_t event_type, event_size;
2200             uint64_t timestamp_ns;
2201 
2202             uint32_t pid, tid;
2203             uint64_t vma/*???*/, code_addr, code_size, id;
2204         } load = {
2205             0/*code load*/, (uint32_t)(sizeof(CodeLoad) + strlen(name) + 1 + size),
2206             timestamp_ns(),
2207 
2208             (uint32_t)getpid(), (uint32_t)SkGetThreadID(),
2209             (uint64_t)fJITBuf, (uint64_t)fJITBuf, size, hash,
2210         };
2211 
2212         // Write the header, the JIT'd function name, and the JIT'd code itself.
2213         fwrite(&load, sizeof(load), 1, jitdump);
2214         fwrite(name, 1, strlen(name), jitdump);
2215         fwrite("\0", 1, 1, jitdump);
2216         fwrite(fJITBuf, 1, size, jitdump);
2217     }
2218 #endif
2219 
2220 
2221 }  // namespace skvm
2222