1 /*
2  * Copyright 2020 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/private/GrTypesPriv.h" // GrAlignTo
9 #include "src/core/SkUtils.h" // sk_unaligned_load
10 #include "src/sksl/SkSLByteCode.h"
11 #include "src/sksl/SkSLExternalValue.h"
12 
13 #include <stack>
14 
15 #ifndef SKSL_INTERPRETER
16 #define SKSL_INTERPRETER
17 
18 namespace SkSL {
19 
// GCC and Clang support the "labels as values" extension, which the interpreter needs in order to
// dispatch with threaded code. On other compilers we fall back to a switch statement in a for loop.
#if defined(__GNUC__) || defined(__clang__)
    #define SKSL_THREADED_CODE
#endif

// LABEL(name) introduces the handler for ByteCode::Instruction::name.
// NEXT() ends a handler by transferring control to the handler of the next instruction.
#ifdef SKSL_THREADED_CODE
    #define LABEL(name) name:
    #ifdef TRACE
        // When tracing, print the offset of 'ip' from 'code' and the disassembly of the upcoming
        // instruction before jumping to it. The sequence is wrapped in do/while so that NEXT()
        // expands to exactly one statement and stays correct under an unbraced if/else.
        #define NEXT()                                                \
            do {                                                      \
                const uint8_t* trace_ip = ip;                         \
                printf("%d: ", (int) (trace_ip - code));              \
                disassemble(&trace_ip);                               \
                goto *labels[(int) read<ByteCode::Instruction>(&ip)]; \
            } while (false)
    #else
        #define NEXT() goto *labels[(int) read<ByteCode::Instruction>(&ip)]
    #endif
#else
    #define LABEL(name) case ByteCode::Instruction::name:
    #define NEXT() continue
#endif
43 
// Verifies one entry of the threaded-code dispatch table: the slot of 'labels' indexed by
// ByteCode::Instruction::name must hold the address of the handler label 'name'. Tripping this
// assert means the order of the opcodes in ByteCode::Instruction no longer matches the order of the
// entries in the 'labels' array in innerRun().
#define CHECK_LABEL(name) SkASSERT(labels[(int) ByteCode::Instruction::name] == &&name)
48 
49 template<typename T>
read(const uint8_t ** ip)50 static T read(const uint8_t** ip) {
51     *ip += sizeof(T);
52     return sk_unaligned_load<T>(*ip - sizeof(T));
53 }
54 
// Handler for a three-register instruction. Reads the target register and then the two source
// registers from the instruction stream, applies 'op' to the 'src' field of the two sources and
// assigns the outcome to the 'result' field of the target. There is no per-lane loop and mask() is
// not consulted.
#define BINARY_OP(inst, src, result, op)                                  \
    LABEL(inst) {                                                         \
        const ByteCode::Register dst = read<ByteCode::Register>(&ip);     \
        const ByteCode::Register lhs = read<ByteCode::Register>(&ip);     \
        const ByteCode::Register rhs = read<ByteCode::Register>(&ip);     \
        fRegisters[dst.fIndex].result =                                   \
                fRegisters[lhs.fIndex].src op fRegisters[rhs.fIndex].src; \
        NEXT();                                                           \
    }
64 
// Masked handler for a three-register instruction. Reads the target register and then the two
// source registers, and for every lane whose mask() entry is nonzero applies 'op' to that lane of
// the sources' 'src' field, storing into the same lane of the target's 'result' field. Lanes whose
// mask entry is zero are not written.
#define MASKED_BINARY_OP(inst, src, result, op)                                           \
    LABEL(inst) {                                                                         \
        const ByteCode::Register dst = read<ByteCode::Register>(&ip);                     \
        const ByteCode::Register lhs = read<ByteCode::Register>(&ip);                     \
        const ByteCode::Register rhs = read<ByteCode::Register>(&ip);                     \
        auto active = mask();                                                             \
        for (int lane = 0; lane < width; ++lane) {                                        \
            if (!active[lane]) {                                                          \
                continue;                                                                 \
            }                                                                             \
            fRegisters[dst.fIndex].result[lane] =                                         \
                    fRegisters[lhs.fIndex].src[lane] op fRegisters[rhs.fIndex].src[lane]; \
        }                                                                                 \
        NEXT();                                                                           \
    }
79 
// Emits two masked handlers:
//   inst   - the single-register form, identical to MASKED_BINARY_OP(inst, src, result, op).
//   instN  - the multi-register form: a uint8_t register count precedes the three register
//            operands, and the operation is applied to 'count' consecutive registers starting at
//            each operand.
// In both forms only lanes whose mask() entry is nonzero are written.
#define MASKED_VECTOR_BINARY_OP(inst, src, result, op)                    \
    MASKED_BINARY_OP(inst, src, result, op)                               \
    LABEL(inst ## N) {                                                    \
        const uint8_t regCount = read<uint8_t>(&ip);                      \
        const ByteCode::Register dst = read<ByteCode::Register>(&ip);     \
        const ByteCode::Register lhs = read<ByteCode::Register>(&ip);     \
        const ByteCode::Register rhs = read<ByteCode::Register>(&ip);     \
        auto active = mask();                                             \
        for (int lane = 0; lane < width; ++lane) {                        \
            if (!active[lane]) {                                          \
                continue;                                                 \
            }                                                             \
            for (int r = 0; r < regCount; ++r) {                          \
                fRegisters[dst.fIndex + r].result[lane] =                 \
                        fRegisters[lhs.fIndex + r].src[lane] op           \
                        fRegisters[rhs.fIndex + r].src[lane];             \
            }                                                             \
        }                                                                 \
        NEXT();                                                           \
    }
110 
// Emits two unmasked handlers:
//   inst   - the single-register form, identical to BINARY_OP(inst, src, result, op).
//   instN  - the multi-register form: a uint8_t register count precedes the three register
//            operands, and the operation is applied to 'count' consecutive registers starting at
//            each operand.
#define VECTOR_BINARY_OP(inst, src, result, op)                                       \
    BINARY_OP(inst, src, result, op)                                                  \
    LABEL(inst ## N) {                                                                \
        const uint8_t regCount = read<uint8_t>(&ip);                                  \
        const ByteCode::Register dst = read<ByteCode::Register>(&ip);                 \
        const ByteCode::Register lhs = read<ByteCode::Register>(&ip);                 \
        const ByteCode::Register rhs = read<ByteCode::Register>(&ip);                 \
        for (int r = 0; r < regCount; ++r) {                                          \
            fRegisters[dst.fIndex + r].result =                                       \
                    fRegisters[lhs.fIndex + r].src op fRegisters[rhs.fIndex + r].src; \
        }                                                                             \
        NEXT();                                                                       \
    }
131 
// Handler for a two-register instruction implemented by a scalar function. Reads the target
// register and then the source register, and stores fn(x) for every one of the 'width' lanes of
// the source's fFloat into the matching lane of the target's fFloat. mask() is not consulted.
#define VECTOR_UNARY_FN(inst, fn)                                            \
    LABEL(inst) {                                                            \
        const ByteCode::Register dst = read<ByteCode::Register>(&ip);        \
        const ByteCode::Register arg = read<ByteCode::Register>(&ip);        \
        for (int lane = 0; lane < width; ++lane) {                           \
            fRegisters[dst.fIndex].fFloat[lane] =                            \
                    fn(fRegisters[arg.fIndex].fFloat[lane]);                 \
        }                                                                    \
        NEXT();                                                              \
    }
141 
// Disassembly case for an instruction without operands: prints 'name' followed by a newline.
#define DISASSEMBLE_0(inst, name)     \
    case ByteCode::Instruction::inst: \
        printf(name "\n");            \
        break;
144 
// Disassembly case for an instruction with a single register operand: prints "name $<index>".
#define DISASSEMBLE_1(inst, name)                                        \
    case ByteCode::Instruction::inst: {                                  \
        const ByteCode::Register operand = read<ByteCode::Register>(ip); \
        printf(name " $%d\n", operand.fIndex);                           \
        break;                                                           \
    }
149 
// Disassembly case for a two-register instruction. The target register is encoded first and the
// source second; the output is "name $<source> -> $<target>".
#define DISASSEMBLE_UNARY(inst, name)                                 \
    case ByteCode::Instruction::inst: {                               \
        const ByteCode::Register dst = read<ByteCode::Register>(ip);  \
        const ByteCode::Register from = read<ByteCode::Register>(ip); \
        printf(name " $%d -> $%d\n", from.fIndex, dst.fIndex);        \
        break;                                                        \
    }
157 
// Disassembly cases for a two-register instruction and its N variant:
//   inst   - identical to DISASSEMBLE_UNARY(inst, name).
//   instN  - a uint8_t count precedes the target and source registers; the output is
//            "name<count> $<source> -> $<target>".
#define DISASSEMBLE_VECTOR_UNARY(inst, name)                                \
    DISASSEMBLE_UNARY(inst, name)                                           \
    case ByteCode::Instruction::inst ## N: {                                \
        const uint8_t regCount = read<uint8_t>(ip);                         \
        const ByteCode::Register dst = read<ByteCode::Register>(ip);        \
        const ByteCode::Register from = read<ByteCode::Register>(ip);       \
        printf(name "%d $%d -> $%d\n", regCount, from.fIndex, dst.fIndex);  \
        break;                                                              \
    }
172 
// Disassembly case for a three-register instruction. The target register is encoded first,
// followed by the two sources; the output is "name $<src1>, $<src2> -> $<target>".
#define DISASSEMBLE_BINARY(inst, name)                                        \
    case ByteCode::Instruction::inst: {                                       \
        const ByteCode::Register dst = read<ByteCode::Register>(ip);          \
        const ByteCode::Register lhs = read<ByteCode::Register>(ip);          \
        const ByteCode::Register rhs = read<ByteCode::Register>(ip);          \
        printf(name " $%d, $%d -> $%d\n", lhs.fIndex, rhs.fIndex, dst.fIndex); \
        break;                                                                \
    }
181 
// Disassembly cases for a three-register instruction and its N variant:
//   inst   - identical to DISASSEMBLE_BINARY(inst, name).
//   instN  - a uint8_t count precedes the target and the two source registers; the output is
//            "name<count> $<src1>, $<src2> -> $<target>".
#define DISASSEMBLE_VECTOR_BINARY(inst, name)                                 \
    DISASSEMBLE_BINARY(inst, name)                                            \
    case ByteCode::Instruction::inst ## N: {                                  \
        const uint8_t regCount = read<uint8_t>(ip);                           \
        const ByteCode::Register dst = read<ByteCode::Register>(ip);          \
        const ByteCode::Register lhs = read<ByteCode::Register>(ip);          \
        const ByteCode::Register rhs = read<ByteCode::Register>(ip);          \
        printf(name "%d $%d, $%d -> $%d\n", regCount, lhs.fIndex, rhs.fIndex, \
               dst.fIndex);                                                   \
        break;                                                                \
    }
198 
199 /**
200  * Operates on vectors of the specified width, so creating an Interpreter<16> means that all inputs,
201  * outputs, and internal calculations will be 16-wide vectors.
202  */
203 template<int width>
204 class Interpreter {
205 public:
    // Element type of the interpreter's memory and register arrays (fMemory / fRegisters): the
    // ByteCode vector type instantiated at this interpreter's width.
    using Vector = ByteCode::Vector<width>;
    // 'width' int32_t values; used for the condition, mask, loop and continue stacks.
    using VectorI = skvx::Vec<width, int32_t>;
    // 'width' float values; this is what a Vector's fFloat field is assigned from.
    using VectorF = skvx::Vec<width, float>;
209 
Interpreter(std::unique_ptr<ByteCode> code)210     Interpreter(std::unique_ptr<ByteCode> code)
211         : fCode(std::move(code)) {
212         // C++ doesn't guarantee proper alignment of naively-allocated vectors, so we can't have the
213         // registers and memory directly as fields of this object without jumping through some hoops
214         // during Interpreter allocation and deallocation. We simplify this by having the backing
215         // store be a separate allocation, jumping through the hoops ourselves rather than require
216         // Interpreter's clients to be aware of alignment.
217         // Ideally, we could use std::aligned_alloc here, but as of this writing it is not available
218         // on some compilers despite claiming to support C++17.
219         fBackingStore = calloc(sizeof(Vector), MEMORY_SIZE + REGISTER_COUNT + 1);
220         fMemory = (Vector*) GrAlignTo((size_t) fBackingStore, alignof(Vector));
221         fRegisters = fMemory + MEMORY_SIZE;
222     }
223 
    // Releases the single allocation made by the constructor. fMemory and fRegisters both point
    // into that allocation, so neither may be used once it has been freed.
    ~Interpreter() {
        free(fBackingStore);
    }
227 
setUniforms(const float uniforms[])228     void setUniforms(const float uniforms[]) {
229         for (int i = 0; i < fCode->getUniformSlotCount(); ++i) {
230             fMemory[fCode->getGlobalSlotCount() + i].fFloat = VectorF(uniforms[i]);
231         }
232     }
233 
234     /**
235      * Returns true on success and stores a pointer to the first slot of the result into outResult.
236      * This pointer is only guaranteed to be valid until the next run() call.
237      */
run(const ByteCodeFunction * f,Vector args[],Vector ** outResult)238      bool run(const ByteCodeFunction* f, Vector args[], Vector** outResult) {
239         SkASSERT(f);
240         VectorI condStack[MASK_STACK_SIZE];
241         memset(&condStack[0], 255, sizeof(condStack[0]));
242         VectorI maskStack[MASK_STACK_SIZE];
243         memset(&maskStack[0], 255, sizeof(maskStack[0]));
244         VectorI loopStack[LOOP_STACK_SIZE];
245         memset(&loopStack[0], 255, sizeof(loopStack[0]));
246         VectorI continueStack[LOOP_STACK_SIZE];
247         memset(&continueStack[0], 0, sizeof(continueStack[0]));
248         Vector* stack = fMemory + MEMORY_SIZE;
249         int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
250         stack -= stackCount;
251         if (f->fParameterSlotCount) {
252             memcpy(stack, args, f->fParameterSlotCount * sizeof(Vector));
253         }
254         Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
255         if (this->innerRun(f, context, 0, outResult)) {
256             int slot = 0;
257             for (const auto& p : f->fParameters) {
258                 if (p.fIsOutParameter) {
259                     memcpy(&args[slot], &stack[slot], p.fSlotCount * sizeof(Vector));
260                 }
261                 slot += p.fSlotCount;
262             }
263             return true;
264         }
265         return false;
266     }
267 
268     /**
269      * Invokes the specified function with the given arguments, 'count' times. 'args' and
270      * 'outResult' are accepted and returned in structure-of-arrays form:
271      *   args[0] points to an array of N values, the first argument for each invocation
272      *   ...
273      *   args[argCount - 1] points to an array of N values, the last argument for each invocation
274      *
275      * All values in 'args', 'outResult', and 'uniforms' are 32-bit values (typically floats,
276      * but possibly int32_t or uint32_t, depending on the types used in the SkSL).
277      * Any 'out' or 'inout' parameters will result in the 'args' array being modified.
278      */
279     bool runStriped(const ByteCodeFunction* f, int count, float* args[],
280                     float* outResult[] = nullptr) {
281         SkASSERT(f);
282         Vector* stack = fMemory + MEMORY_SIZE;
283         int stackCount = f->fStackSlotCount + f->fParameterSlotCount;
284         stack -= stackCount;
285         VectorI condStack[MASK_STACK_SIZE];
286         VectorI maskStack[MASK_STACK_SIZE];
287         VectorI loopStack[LOOP_STACK_SIZE];
288         VectorI continueStack[LOOP_STACK_SIZE];
289         Vector* innerResult = nullptr;
290         Context context(fMemory, stack, condStack, maskStack, loopStack, continueStack);
291         for (int i = 0; i < count; i += width) {
292             int lanes = std::min(width, count - i);
293             size_t size = lanes * sizeof(float);
294             memset(&maskStack[0], 255, sizeof(maskStack[0]));
295             memset(&loopStack[0], 255, sizeof(loopStack[0]));
296             for (int j = lanes; j < width; ++j) {
297                 maskStack[0][j] = 0;
298                 loopStack[0][j] = 0;
299             }
300             memset(&continueStack[0], 0, sizeof(continueStack[0]));
301             for (int j = 0; j < f->fParameterSlotCount; ++j) {
302                 memcpy(stack + j, &args[j][i], size);
303             }
304             if (!this->innerRun(f, context, i, &innerResult)) {
305                 return false;
306             }
307             int slot = 0;
308             for (const auto& p : f->fParameters) {
309                 if (p.fIsOutParameter) {
310                     for (int j = 0; j < p.fSlotCount; ++j) {
311                         memcpy(&args[slot + j][i], stack + slot + j, size);
312                     }
313                 }
314                 slot += p.fSlotCount;
315             }
316             if (outResult) {
317                 for (int j = 0; j < f->fReturnSlotCount; ++j) {
318                     memcpy(&outResult[j][i], &innerResult[j], size);
319                 }
320             }
321         }
322         return true;
323     }
324 
getCode()325     const ByteCode& getCode() {
326         return *fCode;
327     }
328 
329 private:
    // Number of Vector slots reserved for registers. The register file (fRegisters) starts right
    // after the MEMORY_SIZE memory slots in the backing store.
    static constexpr size_t REGISTER_COUNT = 1024;

    // Number of Vector slots of memory (fMemory). run() and runStriped() place the function's
    // frame so that it ends at fMemory + MEMORY_SIZE.
    static constexpr size_t MEMORY_SIZE = 1024;

    // Capacity of the condition and mask stacks that run() / runStriped() hand to innerRun().
    static constexpr size_t MASK_STACK_SIZE = 64;

    // Capacity of the loop and continue stacks that run() / runStriped() hand to innerRun().
    static constexpr size_t LOOP_STACK_SIZE = 16;
337 
338     struct StackFrame {
StackFrameStackFrame339         StackFrame(const ByteCodeFunction* function, const uint8_t* ip, const int stackSlotCount,
340                    Vector* parameters, Vector* returnValue)
341             : fFunction(function)
342             , fIP(ip)
343             , fStackSlotCount(stackSlotCount)
344             , fParameters(parameters)
345             , fReturnValue(returnValue) {}
346 
347         const ByteCodeFunction* fFunction;
348         const uint8_t* fIP;
349         const int fStackSlotCount;
350         Vector* fParameters;
351         Vector* fReturnValue;
352     };
353 
354     struct Context {
ContextContext355         Context(Vector* memory, Vector* stack, VectorI* condStack, VectorI* maskStack,
356                 VectorI* loopStack,VectorI* continueStack)
357             : fMemory(memory)
358             , fStack(stack)
359             , fCondStack(condStack)
360             , fMaskStack(maskStack)
361             , fLoopStack(loopStack)
362             , fContinueStack(continueStack) {}
363 
364         Vector* fMemory;
365         Vector* fStack;
366         VectorI* fCondStack;
367         VectorI* fMaskStack;
368         VectorI* fLoopStack;
369         VectorI* fContinueStack;
370         std::stack<StackFrame> fCallStack;
371     };
372 
373     // $x = register
374     // @x = memory cell
375     // &x = parameter
disassemble(const uint8_t ** ip)376     void disassemble(const uint8_t** ip) {
377         ByteCode::Instruction inst = read<ByteCode::Instruction>(ip);
378         switch (inst) {
379             DISASSEMBLE_VECTOR_BINARY(kAddF, "addF")
380             DISASSEMBLE_VECTOR_BINARY(kAddI, "addI")
381             DISASSEMBLE_BINARY(kAnd, "and")
382             DISASSEMBLE_BINARY(kCompareEQF, "compare eqF")
383             DISASSEMBLE_BINARY(kCompareEQI, "compare eqI")
384             DISASSEMBLE_BINARY(kCompareNEQF, "compare neqF")
385             DISASSEMBLE_BINARY(kCompareNEQI, "compare neqI")
386             DISASSEMBLE_BINARY(kCompareGTF, "compare gtF")
387             DISASSEMBLE_BINARY(kCompareGTS, "compare gtS")
388             DISASSEMBLE_BINARY(kCompareGTU, "compare gtU")
389             DISASSEMBLE_BINARY(kCompareGTEQF, "compare gteqF")
390             DISASSEMBLE_BINARY(kCompareGTEQS, "compare gteqS")
391             DISASSEMBLE_BINARY(kCompareGTEQU, "compare gteqU")
392             DISASSEMBLE_BINARY(kCompareLTF, "compare ltF")
393             DISASSEMBLE_BINARY(kCompareLTS, "compare ltS")
394             DISASSEMBLE_BINARY(kCompareLTU, "compare ltU")
395             DISASSEMBLE_BINARY(kCompareLTEQF, "compare lteqF")
396             DISASSEMBLE_BINARY(kCompareLTEQS, "compare lteqS")
397             DISASSEMBLE_BINARY(kCompareLTEQU, "compare lteqU")
398             DISASSEMBLE_VECTOR_BINARY(kSubtractF, "subF")
399             DISASSEMBLE_VECTOR_BINARY(kSubtractI, "subI")
400             DISASSEMBLE_VECTOR_BINARY(kDivideF, "divF")
401             DISASSEMBLE_VECTOR_BINARY(kDivideS, "divS")
402             DISASSEMBLE_VECTOR_BINARY(kDivideU, "divU")
403             DISASSEMBLE_VECTOR_BINARY(kRemainderS, "remS")
404             DISASSEMBLE_VECTOR_BINARY(kRemainderU, "remU")
405             DISASSEMBLE_VECTOR_BINARY(kRemainderF, "remF")
406             DISASSEMBLE_VECTOR_BINARY(kMultiplyF, "mulF")
407             DISASSEMBLE_VECTOR_BINARY(kMultiplyI, "mulI")
408             DISASSEMBLE_BINARY(kOr, "or")
409             DISASSEMBLE_BINARY(kXor, "xor")
410             DISASSEMBLE_0(kNop, "nop")
411             case ByteCode::Instruction::kBoundsCheck: {
412                 ByteCode::Register r = read<ByteCode::Register>(ip);
413                 int length = read<int>(ip);
414                 printf("boundsCheck 0 <= $%d < %d\n", r.fIndex, length);
415                 break;
416             }
417             case ByteCode::Instruction::kBranch:
418                 printf("branch %d\n", read<ByteCode::Pointer>(ip).fAddress);
419                 break;
420             case ByteCode::Instruction::kBranchIfAllFalse:
421                 printf("branchIfAllFalse %d\n", read<ByteCode::Pointer>(ip).fAddress);
422                 break;
423             DISASSEMBLE_0(kBreak, "break")
424             case ByteCode::Instruction::kCall: {
425                 ByteCode::Register target = read<ByteCode::Register>(ip);
426                 uint8_t idx = read<uint8_t>(ip);
427                 ByteCode::Register args = read<ByteCode::Register>(ip);
428                 ByteCodeFunction* f = fCode->fFunctions[idx].get();
429                 printf("call %s($%d...) -> $%d", f->fName.c_str(), args.fIndex, target.fIndex);
430                 printf("\n");
431                 break;
432             }
433             case ByteCode::Instruction::kCallExternal: {
434                 ByteCode::Register target = read<ByteCode::Register>(ip);
435                 uint8_t idx = read<uint8_t>(ip);
436                 uint8_t targetCount = read<uint8_t>(ip);
437                 ByteCode::Register args = read<ByteCode::Register>(ip);
438                 uint8_t argCount = read<uint8_t>(ip);
439                 ExternalValue* ev = fCode->fExternalValues[idx];
440                 printf("callExternal %s($%d(%d)...) -> $%d(%d)", String(ev->fName).c_str(),
441                         args.fIndex, argCount, target.fIndex, targetCount);
442                 printf("\n");
443                 break;
444             }
445             DISASSEMBLE_0(kContinue, "continue")
446             DISASSEMBLE_UNARY(kCopy, "copy")
447             DISASSEMBLE_UNARY(kCos, "cos")
448             DISASSEMBLE_UNARY(kFloatToSigned, "FtoS")
449             DISASSEMBLE_UNARY(kFloatToUnsigned, "FtoU")
450             case ByteCode::Instruction::kImmediate: {
451                 ByteCode::Register target = read<ByteCode::Register>(ip);
452                 ByteCode::Immediate src = read<ByteCode::Immediate>(ip);
453                 printf("immediate (%d | %f) -> $%d\n", src.fInt, src.fFloat, target.fIndex);
454                 break;
455             }
456             DISASSEMBLE_UNARY(kInverse2x2, "inverse2x2")
457             DISASSEMBLE_UNARY(kInverse3x3, "inverse3x3")
458             DISASSEMBLE_UNARY(kInverse4x4, "inverse4x4")
459             DISASSEMBLE_VECTOR_UNARY(kLoad, "load")
460             case ByteCode::Instruction::kLoadDirect: {
461                 ByteCode::Register target = read<ByteCode::Register>(ip);
462                 ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
463                 printf("loadDirect @%d -> $%d\n", src.fAddress, target.fIndex);
464                 break;
465             }
466             case ByteCode::Instruction::kLoadDirectN: {
467                 uint8_t count = read<uint8_t>(ip);
468                 ByteCode::Register target = read<ByteCode::Register>(ip);
469                 ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
470                 printf("loadDirect%d @%d -> $%d\n", count, src.fAddress, target.fIndex);
471                 break;
472             }
473             DISASSEMBLE_VECTOR_UNARY(kLoadParameter, "loadParameter")
474             case ByteCode::Instruction::kLoadParameterDirect: {
475                 ByteCode::Register target = read<ByteCode::Register>(ip);
476                 ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
477                 printf("loadParameterDirect &%d -> $%d\n", src.fAddress, target.fIndex);
478                 break;
479             }
480             case ByteCode::Instruction::kLoadParameterDirectN: {
481                 uint8_t count = read<uint8_t>(ip);
482                 ByteCode::Register target = read<ByteCode::Register>(ip);
483                 ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
484                 printf("loadParameterDirect%d &%d -> $%d\n", count, src.fAddress, target.fIndex);
485                 break;
486             }
487             DISASSEMBLE_VECTOR_UNARY(kLoadStack, "loadStack")
488             case ByteCode::Instruction::kLoadStackDirect: {
489                 ByteCode::Register target = read<ByteCode::Register>(ip);
490                 ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
491                 printf("loadStackDirect @%d -> $%d\n", src.fAddress, target.fIndex);
492                 break;
493             }
494             case ByteCode::Instruction::kLoadStackDirectN: {
495                 uint8_t count = read<uint8_t>(ip);
496                 ByteCode::Register target = read<ByteCode::Register>(ip);
497                 ByteCode::Pointer src = read<ByteCode::Pointer>(ip);
498                 printf("loadStackDirect%d @%d -> $%d\n", count, src.fAddress, target.fIndex);
499                 break;
500             }
501             DISASSEMBLE_0(kLoopBegin, "loopBegin")
502             DISASSEMBLE_0(kLoopEnd, "loopEnd")
503             DISASSEMBLE_1(kLoopMask, "loopMask")
504             DISASSEMBLE_0(kLoopNext, "loopNext")
505             DISASSEMBLE_0(kMaskNegate, "maskNegate")
506             DISASSEMBLE_0(kMaskPop, "maskPop")
507             DISASSEMBLE_1(kMaskPush, "maskPush")
508             case ByteCode::Instruction::kMatrixMultiply: {
509                 ByteCode::Register target = read<ByteCode::Register>(ip);
510                 ByteCode::Register left = read<ByteCode::Register>(ip);
511                 ByteCode::Register right = read<ByteCode::Register>(ip);
512                 uint8_t leftColsAndRightRows = read<uint8_t>(ip);
513                 uint8_t leftRows = read<uint8_t>(ip);
514                 uint8_t rightColumns = read<uint8_t>(ip);
515                 printf("matrixMultiply $%d, $%d, %d, %d, %d -> $%d\n", left.fIndex, right.fIndex,
516                        leftColsAndRightRows, leftRows, rightColumns, target.fIndex);
517                 break;
518             }
519             case ByteCode::Instruction::kMatrixToMatrix: {
520                 ByteCode::Register target = read<ByteCode::Register>(ip);
521                 ByteCode::Register src = read<ByteCode::Register>(ip);
522                 uint8_t srcColumns = read<uint8_t>(ip);
523                 uint8_t srcRows = read<uint8_t>(ip);
524                 uint8_t dstColumns = read<uint8_t>(ip);
525                 uint8_t dstRows = read<uint8_t>(ip);
526                 printf("matrixToMatrix $%d, %dx%d to %dx%d -> $%d\n", src.fIndex, srcColumns,
527                        srcRows, dstColumns, dstRows, target.fIndex);
528                 break;
529             }
530             DISASSEMBLE_UNARY(kNegateF, "negateF")
531             DISASSEMBLE_UNARY(kNegateS, "negateS")
532             DISASSEMBLE_UNARY(kNot, "not")
533             case ByteCode::Instruction::kReadExternal: {
534                 ByteCode::Register target = read<ByteCode::Register>(ip);
535                 uint8_t count = read<uint8_t>(ip);
536                 uint8_t index = read<uint8_t>(ip);
537                 printf("readExternal %d, %d -> $%d\n", count, index, target.fIndex);
538                 break;
539             }
540             DISASSEMBLE_1(kPrint, "print")
541             DISASSEMBLE_0(kReturn, "return")
542             DISASSEMBLE_1(kReturnValue, "returnValue")
543             case ByteCode::Instruction::kScalarToMatrix: {
544                 ByteCode::Register target = read<ByteCode::Register>(ip);
545                 ByteCode::Register src = read<ByteCode::Register>(ip);
546                 uint8_t columns = read<uint8_t>(ip);
547                 uint8_t rows = read<uint8_t>(ip);
548                 printf("scalarToMatrix $%d, %dx%d -> $%d\n", src.fIndex, columns, rows,
549                        target.fIndex);
550                 break;
551             }
552             case ByteCode::Instruction::kSelect: {
553                 ByteCode::Register target = read<ByteCode::Register>(ip);
554                 ByteCode::Register test = read<ByteCode::Register>(ip);
555                 ByteCode::Register src1 = read<ByteCode::Register>(ip);
556                 ByteCode::Register src2 = read<ByteCode::Register>(ip);
557                 printf("select $%d, $%d, $%d -> %d\n", test.fIndex, src1.fIndex, src2.fIndex,
558                        target.fIndex);
559                 break;
560             }
561             DISASSEMBLE_BINARY(kShiftLeft, "shiftLeft")
562             DISASSEMBLE_BINARY(kShiftRightS, "shiftRightS")
563             DISASSEMBLE_BINARY(kShiftRightU, "shiftRightU")
564             DISASSEMBLE_UNARY(kSignedToFloat, "signedToFloat")
565             DISASSEMBLE_UNARY(kSin, "sin")
566             case ByteCode::Instruction::kSplat: {
567                 uint8_t count = read<uint8_t>(ip);
568                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
569                 ByteCode::Register src = read<ByteCode::Register>(ip);
570                 printf("splat%d $%d -> @%d\n", count, src.fIndex, target.fAddress);
571                 break;
572             }
573             DISASSEMBLE_UNARY(kSqrt, "sqrt")
574             DISASSEMBLE_VECTOR_UNARY(kStore, "store")
575             case ByteCode::Instruction::kStoreDirect: {
576                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
577                 ByteCode::Register src = read<ByteCode::Register>(ip);
578                 printf("store $%d -> @%d\n", src.fIndex, target.fAddress);
579                 break;
580             }
581             case ByteCode::Instruction::kStoreDirectN: {
582                 uint8_t count = read<uint8_t>(ip);
583                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
584                 ByteCode::Register src = read<ByteCode::Register>(ip);
585                 printf("store%d $%d -> @%d\n", count, src.fIndex, target.fAddress);
586                 break;
587             }
588             DISASSEMBLE_VECTOR_UNARY(kStoreParameter, "storeParameter")
589             case ByteCode::Instruction::kStoreParameterDirect: {
590                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
591                 ByteCode::Register src = read<ByteCode::Register>(ip);
592                 printf("storeParameterDirect $%d -> &%d\n", src.fIndex, target.fAddress);
593                 break;
594             }
595             case ByteCode::Instruction::kStoreParameterDirectN: {
596                 uint8_t count = read<uint8_t>(ip);
597                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
598                 ByteCode::Register src = read<ByteCode::Register>(ip);
599                 printf("storeParameterDirect%d $%d -> &%d\n", count, src.fIndex, target.fAddress);
600                 break;
601             }
602             DISASSEMBLE_VECTOR_UNARY(kStoreStack, "storeStack")
603             case ByteCode::Instruction::kStoreStackDirect: {
604                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
605                 ByteCode::Register src = read<ByteCode::Register>(ip);
606                 printf("storeStackDirect $%d -> @%d\n", src.fIndex, target.fAddress);
607                 break;
608             }
609             case ByteCode::Instruction::kStoreStackDirectN: {
610                 uint8_t count = read<uint8_t>(ip);
611                 ByteCode::Pointer target = read<ByteCode::Pointer>(ip);
612                 ByteCode::Register src = read<ByteCode::Register>(ip);
613                 printf("storeStackDirect%d $%d -> @%d\n", count, src.fIndex, target.fAddress);
614                 break;
615             }
616             DISASSEMBLE_UNARY(kTan, "tan")
617             DISASSEMBLE_UNARY(kUnsignedToFloat, "unsignedToFloat")
618             case ByteCode::Instruction::kWriteExternal: {
619                 uint8_t index = read<uint8_t>(ip);
620                 uint8_t count = read<uint8_t>(ip);
621                 ByteCode::Register src = read<ByteCode::Register>(ip);
622                 printf("writeExternal $%d, %d -> %d\n", src.fIndex, count, index);
623                 break;
624             }
625             default:
626                 printf("unsupported: %d\n", (int) inst);
627                 SkASSERT(false);
628         }
629     }
630 
VecMod(Vector x,Vector y)631     static Vector VecMod(Vector x, Vector y) {
632         return Vector(x.fFloat - skvx::trunc(x.fFloat / y.fFloat) * y.fFloat);
633     }
634 
635     #define CHECK_STACK_BOUNDS(address)                              \
636         SkASSERT(context.fStack + address >= fMemory &&              \
637                  context.fStack + address <= fMemory + MEMORY_SIZE)
638 
Inverse2x2(Vector * in,Vector * out)639     static void Inverse2x2(Vector* in, Vector* out) {
640         VectorF a = in[0].fFloat,
641                 b = in[1].fFloat,
642                 c = in[2].fFloat,
643                 d = in[3].fFloat;
644         VectorF idet = VectorF(1) / (a*d - b*c);
645         out[0].fFloat = d * idet;
646         out[1].fFloat = -b * idet;
647         out[2].fFloat = -c * idet;
648         out[3].fFloat = a * idet;
649     }
650 
Inverse3x3(Vector * in,Vector * out)651     static void Inverse3x3(Vector* in, Vector* out) {
652         VectorF a11 = in[0].fFloat, a12 = in[3].fFloat, a13 = in[6].fFloat,
653                 a21 = in[1].fFloat, a22 = in[4].fFloat, a23 = in[7].fFloat,
654                 a31 = in[2].fFloat, a32 = in[5].fFloat, a33 = in[8].fFloat;
655         VectorF idet = VectorF(1) / (a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
656                                      a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31);
657         out[0].fFloat = (a22 * a33 - a23 * a32) * idet;
658         out[1].fFloat = (a23 * a31 - a21 * a33) * idet;
659         out[2].fFloat = (a21 * a32 - a22 * a31) * idet;
660         out[3].fFloat = (a13 * a32 - a12 * a33) * idet;
661         out[4].fFloat = (a11 * a33 - a13 * a31) * idet;
662         out[5].fFloat = (a12 * a31 - a11 * a32) * idet;
663         out[6].fFloat = (a12 * a23 - a13 * a22) * idet;
664         out[7].fFloat = (a13 * a21 - a11 * a23) * idet;
665         out[8].fFloat = (a11 * a22 - a12 * a21) * idet;
666     }
667 
668 
Inverse4x4(Vector * in,Vector * out)669     static void Inverse4x4(Vector* in, Vector* out) {
670         #define inf(index)  in[index].fFloat
671         #define outf(index) out[index].fFloat
672         VectorF a00 = inf(0), a10 = inf(4), a20 = inf( 8), a30 = inf(12),
673                 a01 = inf(1), a11 = inf(5), a21 = inf( 9), a31 = inf(13),
674                 a02 = inf(2), a12 = inf(6), a22 = inf(10), a32 = inf(14),
675                 a03 = inf(3), a13 = inf(7), a23 = inf(11), a33 = inf(15);
676 
677         VectorF b00 = a00 * a11 - a01 * a10,
678                 b01 = a00 * a12 - a02 * a10,
679                 b02 = a00 * a13 - a03 * a10,
680                 b03 = a01 * a12 - a02 * a11,
681                 b04 = a01 * a13 - a03 * a11,
682                 b05 = a02 * a13 - a03 * a12,
683                 b06 = a20 * a31 - a21 * a30,
684                 b07 = a20 * a32 - a22 * a30,
685                 b08 = a20 * a33 - a23 * a30,
686                 b09 = a21 * a32 - a22 * a31,
687                 b10 = a21 * a33 - a23 * a31,
688                 b11 = a22 * a33 - a23 * a32;
689 
690         VectorF idet = VectorF(1) /
691                             (b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06);
692 
693         b00 *= idet;
694         b01 *= idet;
695         b02 *= idet;
696         b03 *= idet;
697         b04 *= idet;
698         b05 *= idet;
699         b06 *= idet;
700         b07 *= idet;
701         b08 *= idet;
702         b09 *= idet;
703         b10 *= idet;
704         b11 *= idet;
705 
706         outf( 0) = a11 * b11 - a12 * b10 + a13 * b09;
707         outf( 1) = a02 * b10 - a01 * b11 - a03 * b09;
708         outf( 2) = a31 * b05 - a32 * b04 + a33 * b03;
709         outf( 3) = a22 * b04 - a21 * b05 - a23 * b03;
710         outf( 4) = a12 * b08 - a10 * b11 - a13 * b07;
711         outf( 5) = a00 * b11 - a02 * b08 + a03 * b07;
712         outf( 6) = a32 * b02 - a30 * b05 - a33 * b01;
713         outf( 7) = a20 * b05 - a22 * b02 + a23 * b01;
714         outf( 8) = a10 * b10 - a11 * b08 + a13 * b06;
715         outf( 9) = a01 * b08 - a00 * b10 - a03 * b06;
716         outf(10) = a30 * b04 - a31 * b02 + a33 * b00;
717         outf(11) = a21 * b02 - a20 * b04 - a23 * b00;
718         outf(12) = a11 * b07 - a10 * b09 - a12 * b06;
719         outf(13) = a00 * b09 - a01 * b07 + a02 * b06;
720         outf(14) = a31 * b01 - a30 * b03 - a32 * b00;
721         outf(15) = a20 * b03 - a21 * b01 + a22 * b00;
722         #undef inf
723         #undef outf
724     }
725 
innerRun(const ByteCodeFunction * f,Context context,int baseIndex,Vector ** outResult)726     bool innerRun(const ByteCodeFunction* f, Context context, int baseIndex, Vector** outResult) {
727 #ifdef SKSL_THREADED_CODE
728         static const void* labels[] = {
729             // If you aren't familiar with it, the &&label syntax is the GCC / Clang "labels as
730             // values" extension. If you add anything to this array, be sure to add the
731             // corresponding CHECK_LABEL() assert below.
732             &&kNop,
733             &&kAbort,
734             &&kAddF,
735             &&kAddFN,
736             &&kAddI,
737             &&kAddIN,
738             &&kAnd,
739             &&kBoundsCheck,
740             &&kBranch,
741             &&kBranchIfAllFalse,
742             &&kBreak,
743             &&kCall,
744             &&kCallExternal,
745             &&kCompareEQF,
746             &&kCompareEQI,
747             &&kCompareNEQF,
748             &&kCompareNEQI,
749             &&kCompareGTF,
750             &&kCompareGTS,
751             &&kCompareGTU,
752             &&kCompareGTEQF,
753             &&kCompareGTEQS,
754             &&kCompareGTEQU,
755             &&kCompareLTF,
756             &&kCompareLTS,
757             &&kCompareLTU,
758             &&kCompareLTEQF,
759             &&kCompareLTEQS,
760             &&kCompareLTEQU,
761             &&kContinue,
762             &&kCopy,
763             &&kCos,
764             &&kDivideF,
765             &&kDivideFN,
766             &&kDivideS,
767             &&kDivideSN,
768             &&kDivideU,
769             &&kDivideUN,
770             &&kFloatToSigned,
771             &&kFloatToUnsigned,
772             &&kImmediate,
773             &&kInverse2x2,
774             &&kInverse3x3,
775             &&kInverse4x4,
776             &&kLoad,
777             &&kLoadN,
778             &&kLoadDirect,
779             &&kLoadDirectN,
780             &&kLoadParameter,
781             &&kLoadParameterN,
782             &&kLoadParameterDirect,
783             &&kLoadParameterDirectN,
784             &&kLoadStack,
785             &&kLoadStackN,
786             &&kLoadStackDirect,
787             &&kLoadStackDirectN,
788             &&kLoopBegin,
789             &&kLoopEnd,
790             &&kLoopMask,
791             &&kLoopNext,
792             &&kMaskNegate,
793             &&kMaskPop,
794             &&kMaskPush,
795             &&kMatrixMultiply,
796             &&kMatrixToMatrix,
797             &&kMultiplyF,
798             &&kMultiplyFN,
799             &&kMultiplyI,
800             &&kMultiplyIN,
801             &&kNegateF,
802             &&kNegateS,
803             &&kNot,
804             &&kOr,
805             &&kPrint,
806             &&kReadExternal,
807             &&kRemainderF,
808             &&kRemainderFN,
809             &&kRemainderS,
810             &&kRemainderSN,
811             &&kRemainderU,
812             &&kRemainderUN,
813             &&kReturn,
814             &&kReturnValue,
815             &&kScalarToMatrix,
816             &&kSelect,
817             &&kShiftLeft,
818             &&kShiftRightS,
819             &&kShiftRightU,
820             &&kSignedToFloat,
821             &&kSin,
822             &&kSplat,
823             &&kSqrt,
824             &&kStore,
825             &&kStoreN,
826             &&kStoreDirect,
827             &&kStoreDirectN,
828             &&kStoreParameter,
829             &&kStoreParameterN,
830             &&kStoreParameterDirect,
831             &&kStoreParameterDirectN,
832             &&kStoreStack,
833             &&kStoreStackN,
834             &&kStoreStackDirect,
835             &&kStoreStackDirectN,
836             &&kSubtractF,
837             &&kSubtractFN,
838             &&kSubtractI,
839             &&kSubtractIN,
840             &&kTan,
841             &&kUnsignedToFloat,
842             &&kWriteExternal,
843             &&kXor
844         };
845         CHECK_LABEL(kNop);
846         CHECK_LABEL(kAbort);
847         CHECK_LABEL(kAddF);
848         CHECK_LABEL(kAddI);
849         CHECK_LABEL(kAnd);
850         CHECK_LABEL(kBoundsCheck);
851         CHECK_LABEL(kBranch);
852         CHECK_LABEL(kBranchIfAllFalse);
853         CHECK_LABEL(kBreak);
854         CHECK_LABEL(kCall);
855         CHECK_LABEL(kCallExternal);
856         CHECK_LABEL(kCompareEQF);
857         CHECK_LABEL(kCompareEQI);
858         CHECK_LABEL(kCompareNEQF);
859         CHECK_LABEL(kCompareNEQI);
860         CHECK_LABEL(kCompareGTF);
861         CHECK_LABEL(kCompareGTS);
862         CHECK_LABEL(kCompareGTU);
863         CHECK_LABEL(kCompareGTEQF);
864         CHECK_LABEL(kCompareGTEQS);
865         CHECK_LABEL(kCompareGTEQU);
866         CHECK_LABEL(kCompareLTF);
867         CHECK_LABEL(kCompareLTS);
868         CHECK_LABEL(kCompareLTU);
869         CHECK_LABEL(kCompareLTEQF);
870         CHECK_LABEL(kCompareLTEQS);
871         CHECK_LABEL(kCompareLTEQU);
872         CHECK_LABEL(kContinue);
873         CHECK_LABEL(kCopy);
874         CHECK_LABEL(kCos);
875         CHECK_LABEL(kDivideF);
876         CHECK_LABEL(kDivideFN);
877         CHECK_LABEL(kDivideS);
878         CHECK_LABEL(kDivideSN);
879         CHECK_LABEL(kDivideU);
880         CHECK_LABEL(kDivideUN);
881         CHECK_LABEL(kFloatToSigned);
882         CHECK_LABEL(kFloatToUnsigned);
883         CHECK_LABEL(kImmediate);
884         CHECK_LABEL(kInverse2x2);
885         CHECK_LABEL(kInverse3x3);
886         CHECK_LABEL(kInverse4x4);
887         CHECK_LABEL(kLoad);
888         CHECK_LABEL(kLoadN);
889         CHECK_LABEL(kLoadDirect);
890         CHECK_LABEL(kLoadDirectN);
891         CHECK_LABEL(kLoadParameter);
892         CHECK_LABEL(kLoadParameterN);
893         CHECK_LABEL(kLoadParameterDirect);
894         CHECK_LABEL(kLoadParameterDirectN);
895         CHECK_LABEL(kLoadStack);
896         CHECK_LABEL(kLoadStackN);
897         CHECK_LABEL(kLoadStackDirect);
898         CHECK_LABEL(kLoadStackDirectN);
899         CHECK_LABEL(kLoopBegin);
900         CHECK_LABEL(kLoopEnd);
901         CHECK_LABEL(kLoopMask);
902         CHECK_LABEL(kLoopNext);
903         CHECK_LABEL(kMaskNegate);
904         CHECK_LABEL(kMaskPop);
905         CHECK_LABEL(kMaskPush);
906         CHECK_LABEL(kMatrixMultiply);
907         CHECK_LABEL(kMatrixToMatrix);
908         CHECK_LABEL(kMultiplyF);
909         CHECK_LABEL(kMultiplyFN);
910         CHECK_LABEL(kMultiplyI);
911         CHECK_LABEL(kMultiplyIN);
912         CHECK_LABEL(kNegateF);
913         CHECK_LABEL(kNegateS);
914         CHECK_LABEL(kNot);
915         CHECK_LABEL(kOr);
916         CHECK_LABEL(kPrint);
917         CHECK_LABEL(kReadExternal);
918         CHECK_LABEL(kRemainderF);
919         CHECK_LABEL(kRemainderFN);
920         CHECK_LABEL(kRemainderS);
921         CHECK_LABEL(kRemainderSN);
922         CHECK_LABEL(kRemainderU);
923         CHECK_LABEL(kRemainderUN);
924         CHECK_LABEL(kReturn);
925         CHECK_LABEL(kReturnValue);
926         CHECK_LABEL(kScalarToMatrix);
927         CHECK_LABEL(kSelect);
928         CHECK_LABEL(kShiftLeft);
929         CHECK_LABEL(kShiftRightS);
930         CHECK_LABEL(kShiftRightU);
931         CHECK_LABEL(kSignedToFloat);
932         CHECK_LABEL(kSin);
933         CHECK_LABEL(kSplat);
934         CHECK_LABEL(kSqrt);
935         CHECK_LABEL(kStore);
936         CHECK_LABEL(kStoreN);
937         CHECK_LABEL(kStoreDirect);
938         CHECK_LABEL(kStoreDirectN);
939         CHECK_LABEL(kStoreParameter);
940         CHECK_LABEL(kStoreParameterN);
941         CHECK_LABEL(kStoreParameterDirect);
942         CHECK_LABEL(kStoreParameterDirectN);
943         CHECK_LABEL(kStoreStack);
944         CHECK_LABEL(kStoreStackN);
945         CHECK_LABEL(kStoreStackDirect);
946         CHECK_LABEL(kStoreStackDirectN);
947         CHECK_LABEL(kSubtractF);
948         CHECK_LABEL(kSubtractFN);
949         CHECK_LABEL(kSubtractI);
950         CHECK_LABEL(kSubtractIN);
951         CHECK_LABEL(kTan);
952         CHECK_LABEL(kUnsignedToFloat);
953         CHECK_LABEL(kWriteExternal);
954         CHECK_LABEL(kXor);
955 #endif
956         auto mask = [&]() { return *context.fMaskStack & *context.fLoopStack; };
957         auto parameterBase = [&]() {
958             return context.fCallStack.empty() ? context.fStack
959                                               : context.fCallStack.top().fParameters;
960         };
961         const uint8_t* code = f->fCode.data();
962         const uint8_t* ip = code;
963 #ifdef SKSL_THREADED_CODE
964         #ifdef TRACE
965             const uint8_t* trace_ip = ip;
966             printf("0: ");
967             disassemble(&trace_ip);
968         #endif
969         goto *labels[(int) read<ByteCode::Instruction>(&ip)];
970 #else
971         for (;;) {
972             #ifdef TRACE
973                 const uint8_t* trace_ip = ip;
974                 disassemble(&trace_ip);
975             #endif
976             ByteCode::Instruction inst = read<ByteCode::Instruction>(&ip);
977             switch (inst) {
978 #endif
979                 VECTOR_BINARY_OP(kAddF, fFloat, fFloat, +)
980                 VECTOR_BINARY_OP(kAddI, fInt, fInt, +)
981                 BINARY_OP(kAnd, fInt, fInt, &)
982                 BINARY_OP(kCompareEQF, fFloat, fInt, ==)
983                 BINARY_OP(kCompareEQI, fInt, fInt, ==)
984                 BINARY_OP(kCompareNEQF, fFloat, fInt, !=)
985                 BINARY_OP(kCompareNEQI, fInt, fInt, !=)
986                 BINARY_OP(kCompareGTF, fFloat, fInt, >)
987                 BINARY_OP(kCompareGTS, fInt, fInt, >)
988                 BINARY_OP(kCompareGTU, fUInt, fUInt, >)
989                 BINARY_OP(kCompareGTEQF, fFloat, fInt, >=)
990                 BINARY_OP(kCompareGTEQS, fInt, fInt, >=)
991                 BINARY_OP(kCompareGTEQU, fUInt, fUInt, >=)
992                 BINARY_OP(kCompareLTF, fFloat, fInt, <)
993                 BINARY_OP(kCompareLTS, fInt, fInt, <)
994                 BINARY_OP(kCompareLTU, fUInt, fUInt, <)
995                 BINARY_OP(kCompareLTEQF, fFloat, fInt, <=)
996                 BINARY_OP(kCompareLTEQS, fInt, fInt, <=)
997                 BINARY_OP(kCompareLTEQU, fUInt, fUInt, <=)
998                 VECTOR_BINARY_OP(kSubtractF, fFloat, fFloat, -)
999                 VECTOR_BINARY_OP(kSubtractI, fInt, fInt, -)
1000                 VECTOR_BINARY_OP(kDivideF, fFloat, fFloat, /)
1001                 MASKED_VECTOR_BINARY_OP(kDivideS, fInt, fInt, /)
1002                 MASKED_VECTOR_BINARY_OP(kDivideU, fUInt, fUInt, /)
1003                 MASKED_VECTOR_BINARY_OP(kRemainderS, fInt, fInt, %)
1004                 MASKED_VECTOR_BINARY_OP(kRemainderU, fUInt, fUInt, %)
1005                 VECTOR_BINARY_OP(kMultiplyF, fFloat, fFloat, *)
1006                 VECTOR_BINARY_OP(kMultiplyI, fInt, fInt, *)
1007                 BINARY_OP(kOr, fInt, fInt, |)
1008                 BINARY_OP(kXor, fInt, fInt, ^)
1009                 LABEL(kAbort)
1010                     SkASSERT(false);
1011                     return false;
1012                 LABEL(kBoundsCheck) {
1013                     ByteCode::Register r = read<ByteCode::Register>(&ip);
1014                     int length = read<int>(&ip);
1015                     if (skvx::any(mask() & ((fRegisters[r.fIndex].fInt < 0) |
1016                                             (fRegisters[r.fIndex].fInt >= length)))) {
1017                         return false;
1018                     }
1019                     NEXT();
1020                 }
1021                 LABEL(kBranch) {
1022                     ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
1023                     ip = code + target.fAddress;
1024                     NEXT();
1025                 }
1026                 LABEL(kBranchIfAllFalse) {
1027                     ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
1028                     if (!skvx::any(mask())) {
1029                         ip = code + target.fAddress;
1030                     }
1031                     NEXT();
1032                 }
1033                 LABEL(kBreak)
1034                     *context.fLoopStack &= ~mask();
1035                     NEXT();
1036                 LABEL(kCall) {
1037                     ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
1038                     uint8_t idx = read<uint8_t>(&ip);
1039                     ByteCode::Register args = read<ByteCode::Register>(&ip);
1040                     const ByteCodeFunction* target = fCode->fFunctions[idx].get();
1041                     int stackSlotCount = target->fStackSlotCount + target->fParameterSlotCount;
1042                     context.fCallStack.push(StackFrame(f, ip, stackSlotCount,
1043                                                        &fRegisters[args.fIndex],
1044                                                        &fRegisters[returnValue.fIndex]));
1045                     f = target;
1046                     code = f->fCode.data();
1047                     ip = code;
1048                     context.fStack -= stackSlotCount;
1049                     memcpy(context.fStack, &fRegisters[args.fIndex],
1050                            f->fParameterSlotCount * sizeof(Vector));
1051                     NEXT();
1052                 }
1053                 LABEL(kCallExternal) {
1054                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1055                     uint8_t index = read<uint8_t>(&ip);
1056                     uint8_t targetSize = read<uint8_t>(&ip);
1057                     ByteCode::Register arguments = read<ByteCode::Register>(&ip);
1058                     uint8_t argumentSize = read<uint8_t>(&ip);
1059                     ExternalValue* v = fCode->fExternalValues[index];
1060                     float tmpReturn[64];
1061                     SkASSERT(targetSize < 64);
1062                     float tmpArgs[64];
1063                     SkASSERT(argumentSize < 64);
1064                     VectorI m = mask();
1065                     for (int i = 0; i < width; ++i) {
1066                         if (m[i]) {
1067                             for (int j = 0; j < argumentSize; j++) {
1068                                 tmpArgs[j] = fRegisters[arguments.fIndex + j].fFloat[i];
1069                             }
1070                             v->call(baseIndex + i, tmpArgs, tmpReturn);
1071                             for (int j = 0; j < targetSize; j++) {
1072                                 fRegisters[target.fIndex + j].fFloat[i] = tmpReturn[j];
1073                             }
1074                         }
1075                     }
1076                     NEXT();
1077                 }
1078                 LABEL(kContinue) {
1079                     VectorI m = mask();
1080                     *context.fContinueStack |= m;
1081                     *context.fLoopStack &= ~m;
1082                     NEXT();
1083                 }
1084                 LABEL(kCopy) {
1085                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1086                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1087                     fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt;
1088                     NEXT();
1089                 }
1090                 VECTOR_UNARY_FN(kCos, cosf)
1091                 LABEL(kFloatToSigned) {
1092                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1093                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1094                     fRegisters[target.fIndex] = Vector(skvx::cast<int32_t>(
1095                                                        fRegisters[src.fIndex].fFloat));
1096                     NEXT();
1097                 }
1098                 LABEL(kFloatToUnsigned) {
1099                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1100                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1101                     fRegisters[target.fIndex] = Vector(skvx::cast<uint32_t>(
1102                                                        fRegisters[src.fIndex].fFloat));
1103                     NEXT();
1104                 }
1105                 LABEL(kImmediate) {
1106                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1107                     ByteCode::Immediate src = read<ByteCode::Immediate>(&ip);
1108                     fRegisters[target.fIndex].fInt = src.fInt;
1109                     NEXT();
1110                 }
1111                 LABEL(kInverse2x2) {
1112                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1113                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1114                     Inverse2x2(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
1115                     NEXT();
1116                 }
1117                 LABEL(kInverse3x3) {
1118                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1119                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1120                     Inverse3x3(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
1121                     NEXT();
1122                 }
1123                 LABEL(kInverse4x4) {
1124                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1125                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1126                     Inverse4x4(&fRegisters[src.fIndex], &fRegisters[target.fIndex]);
1127                     NEXT();
1128                 }
1129                 LABEL(kLoad) {
1130                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1131                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1132                     VectorI m = mask();
1133                     for (int i = 0; i < width; ++i) {
1134                         if (m[i]) {
1135                             fRegisters[target.fIndex].fInt[i] =
1136                                                     fMemory[fRegisters[src.fIndex].fInt[i]].fInt[i];
1137                         }
1138                     }
1139                     NEXT();
1140                 }
1141                 LABEL(kLoadN) {
1142                     uint8_t count = read<uint8_t>(&ip);
1143                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1144                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1145                     VectorI m = mask();
1146                     for (int i = 0; i < width; ++i) {
1147                         if (m[i]) {
1148                             for (int j = 0; j < count; ++j) {
1149                                 fRegisters[target.fIndex + j].fInt[i] =
1150                                                 fMemory[fRegisters[src.fIndex].fInt[i] + j].fInt[i];
1151                             }
1152                         }
1153                     }
1154                     NEXT();
1155                 }
1156                 LABEL(kLoadDirect) {
1157                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1158                     ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
1159                     fRegisters[target.fIndex].fInt = fMemory[src.fAddress].fInt;
1160                     NEXT();
1161                 }
1162                 LABEL(kLoadDirectN) {
1163                     uint8_t count = read<uint8_t>(&ip);
1164                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1165                     ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
1166                     for (int i = 0; i < count; ++i) {
1167                         fRegisters[target.fIndex + i].fInt = fMemory[src.fAddress + i].fInt;
1168                     }
1169                     NEXT();
1170                 }
1171                 LABEL(kLoadParameter) {
1172                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1173                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1174                     Vector* base = parameterBase();
1175                     VectorI m = mask();
1176                     for (int i = 0; i < width; ++i) {
1177                         if (m[i]) {
1178                             fRegisters[target.fIndex].fInt[i] =
1179                                                        base[fRegisters[src.fIndex].fInt[i]].fInt[i];
1180                         }
1181                     }
1182                     NEXT();
1183                 }
1184                 LABEL(kLoadParameterN) {
1185                     uint8_t count = read<uint8_t>(&ip);
1186                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1187                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1188                     Vector* base = parameterBase();
1189                     VectorI m = mask();
1190                     for (int i = 0; i < width; ++i) {
1191                         if (m[i]) {
1192                             for (int j = 0; j < count; ++j) {
1193                                 fRegisters[target.fIndex + j].fInt[i] =
1194                                                    base[fRegisters[src.fIndex].fInt[i] + j].fInt[i];
1195                             }
1196                         }
1197                     }
1198                     NEXT();
1199                 }
1200                 LABEL(kLoadParameterDirect) {
1201                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1202                     ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
1203                     Vector* base = parameterBase();
1204                     fRegisters[target.fIndex].fInt = base[src.fAddress].fInt;
1205                     NEXT();
1206                 }
1207                 LABEL(kLoadParameterDirectN) {
1208                     uint8_t count = read<uint8_t>(&ip);
1209                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1210                     ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
1211                     Vector* base = parameterBase();
1212                     for (int i = 0; i < count; ++i) {
1213                         fRegisters[target.fIndex + i].fInt = base[src.fAddress + i].fInt;
1214                     }
1215                     NEXT();
1216                 }
1217                 LABEL(kLoadStack) {
1218                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1219                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1220                     VectorI m = mask();
1221                     for (int i = 0; i < width; ++i) {
1222                         if (m[i]) {
1223                             fRegisters[target.fIndex].fInt[i] =
1224                                              context.fStack[fRegisters[src.fIndex].fInt[i]].fInt[i];
1225                         }
1226                     }
1227                     NEXT();
1228                 }
1229                 LABEL(kLoadStackN) {
                         // Multi-slot indirect stack load. The address register is deliberately
                         // re-read for every slot so that overlapping registers behave as before.
1230                     const uint8_t slots = read<uint8_t>(&ip);
1231                     const auto dst = read<ByteCode::Register>(&ip);
1232                     const auto addr = read<ByteCode::Register>(&ip);
1233                     VectorI active = mask();
1234                     for (int lane = 0; lane < width; ++lane) {
1235                         if (active[lane]) {
1236                             for (int n = 0; n < slots; ++n) {
1237                                 auto& from = context.fStack[fRegisters[addr.fIndex].fInt[lane] + n];
1238                                 fRegisters[dst.fIndex + n].fInt[lane] = from.fInt[lane];
1239                             }
1240                         }
1241                     }
1242                     NEXT();
1243                 }
1244                 LABEL(kLoadStackDirect) {
                         // Copies the stack slot at a constant offset from context.fStack into the
                         // destination register (all lanes, no masking).
1245                     const auto dst = read<ByteCode::Register>(&ip);
1246                     const auto slot = read<ByteCode::Pointer>(&ip);
1247                     CHECK_STACK_BOUNDS(slot.fAddress);
1248                     fRegisters[dst.fIndex].fInt = context.fStack[slot.fAddress].fInt;
1249                     NEXT();
1250                 }
1251                 LABEL(kLoadStackDirectN) {
                         // Copies `count` consecutive stack slots, starting at a constant offset from
                         // context.fStack, into `count` consecutive registers (all lanes, no masking).
1252                     uint8_t count = read<uint8_t>(&ip);
1253                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1254                     ByteCode::Pointer src = read<ByteCode::Pointer>(&ip);
1255                     for (int i = 0; i < count; ++i) {
                             // Bounds-check every slot that is read, not just the first one.
1256                         const auto slot = src.fAddress + i;
1257                         CHECK_STACK_BOUNDS(slot);
1258                         fRegisters[target.fIndex + i].fInt = context.fStack[slot].fInt;
1259                     }
                         NEXT();
1260                 }
1261                 LABEL(kLoopBegin) {
                         // Pushes a new loop-stack entry initialised from the current top entry,
                         // and a new continue-stack entry initialised to 0.
1262                     context.fLoopStack[1] = context.fLoopStack[0];
1263                     ++context.fLoopStack;
1264                     context.fContinueStack[1] = 0;
1265                     ++context.fContinueStack;
1266                     NEXT();
1267                 }
1268                 LABEL(kLoopEnd) {
                         // Pops one entry from both the loop stack and the continue stack
                         // (the inverse of the two pushes done by kLoopBegin).
1269                     --context.fLoopStack;
1270                     --context.fContinueStack;
1271                     NEXT();
1272                 }
1273                 LABEL(kLoopMask) {
                         // ANDs the top loop-stack entry with the given register, so bits that are
                         // clear in the register are also cleared in the loop entry.
1274                     ByteCode::Register value = read<ByteCode::Register>(&ip);
1275                     *context.fLoopStack &= fRegisters[value.fIndex].fInt;
1276                     NEXT();
1277                 }
1278                 LABEL(kLoopNext) {
                         // ORs the top continue-stack entry back into the top loop-stack entry, then
                         // resets the continue entry to 0.
                         // NOTE(review): presumably re-enables lanes that hit `continue` -- confirm.
1279                     *context.fLoopStack |= *context.fContinueStack;
1280                     *context.fContinueStack = 0;
1281                     NEXT();
1282                 }
1283                 LABEL(kMaskNegate) {
                         // Overwrites the top mask entry with (entry below it) AND NOT (top condition).
                         // NOTE(review): presumably switches to the else-side of a branch -- confirm.
1284                     *context.fMaskStack = context.fMaskStack[-1] & ~context.fCondStack[0];
1285                     NEXT();
1286                 }
1287                 LABEL(kMaskPop) {
                         // Pops one entry from both the mask stack and the condition stack
                         // (the inverse of the two pushes done by kMaskPush).
1288                     --context.fMaskStack;
1289                     --context.fCondStack;
1290                     NEXT();
1291                 }
1292                 LABEL(kMaskPush) {
                         // Pushes the register's value as a new condition entry and pushes a new mask
                         // entry equal to (current mask AND that condition). Both slots are written
                         // before the stack pointers are advanced.
1293                     ByteCode::Register value = read<ByteCode::Register>(&ip);
1294                     context.fCondStack[1] = fRegisters[value.fIndex].fInt;
1295                     context.fMaskStack[1] = context.fMaskStack[0] & context.fCondStack[1];
1296                     ++context.fCondStack;
1297                     ++context.fMaskStack;
1298                     NEXT();
1299                 }
1300                 LABEL(kMatrixMultiply) {
                         // Matrix product target = left * right. Each matrix occupies consecutive
                         // registers in column-major order: element (column c, row r) lives at
                         // base + c * rows + r.
1301                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1302                     ByteCode::Register left = read<ByteCode::Register>(&ip);
1303                     ByteCode::Register right = read<ByteCode::Register>(&ip);
1304                     uint8_t lCols = read<uint8_t>(&ip);
1305                     uint8_t lRows = read<uint8_t>(&ip);
1306                     uint8_t rCols = read<uint8_t>(&ip);
                         // The right operand's row count is not encoded; it equals left's column count.
1307                     uint8_t rRows = lCols;
                         // Zero the rCols x lRows result, then accumulate the dot products into it.
                         // NOTE(review): this assumes target does not overlap left/right -- confirm.
1308                     memset(&fRegisters[target.fIndex], 0, sizeof(Vector) * rCols * lRows);
1309                     for (int c = 0; c < rCols; ++c) {
1310                         for (int r = 0; r < lRows; ++r) {
1311                             for (int j = 0; j < lCols; ++j) {
1312                                 fRegisters[target.fIndex + c * lRows + r].fFloat +=
1313                                         fRegisters[left.fIndex + j * lRows + r].fFloat *
1314                                         fRegisters[right.fIndex + c * rRows + j].fFloat;
1315                             }
1316                         }
1317                     }
1318                     NEXT();
1319                 }
1320                 LABEL(kMatrixToMatrix) {
                         // Resizes a column-major matrix. Cells present in the source are copied;
                         // cells outside it are filled from the identity (1 on the diagonal, else 0).
1321                     const auto dst = read<ByteCode::Register>(&ip);
1322                     const auto from = read<ByteCode::Register>(&ip);
1323                     const uint8_t fromCols = read<uint8_t>(&ip);
1324                     const uint8_t fromRows = read<uint8_t>(&ip);
1325                     const uint8_t toCols = read<uint8_t>(&ip);
1326                     const uint8_t toRows = read<uint8_t>(&ip);
1327                     int cell = 0;
1328                     for (int col = 0; col < toCols; ++col) {
1329                         for (int row = 0; row < toRows; ++row, ++cell) {
1330                             Vector& out = fRegisters[dst.fIndex + cell];
1331                             if (col >= fromCols || row >= fromRows) {
1332                                 out.fFloat = (col == row) ? 1 : 0;
1333                             } else {
1334                                 out = fRegisters[from.fIndex + (fromRows * col) + row];
1335                             }
1336                         }
1337                     }
1338                     NEXT();
1339                 }
1345                 LABEL(kNegateF) {
                         // Floating-point negation of every lane: dst = -operand.
1346                     const auto dst = read<ByteCode::Register>(&ip);
1347                     const auto operand = read<ByteCode::Register>(&ip);
1348                     fRegisters[dst.fIndex].fFloat = -fRegisters[operand.fIndex].fFloat;
1349                     NEXT();
1350                 }
1351                 LABEL(kNegateS) {
                         // Signed-integer negation of every lane: dst = -operand.
1352                     const auto dst = read<ByteCode::Register>(&ip);
1353                     const auto operand = read<ByteCode::Register>(&ip);
1354                     fRegisters[dst.fIndex].fInt = -fRegisters[operand.fIndex].fInt;
1355                     NEXT();
1356                 }
1357                 LABEL(kNop)
                         // No operands and no effect: dispatch straight to the next instruction.
1358                     NEXT();
1359                 LABEL(kNot) {
                         // Bitwise complement of every integer lane: dst = ~operand.
1360                     const auto dst = read<ByteCode::Register>(&ip);
1361                     const auto operand = read<ByteCode::Register>(&ip);
1362                     fRegisters[dst.fIndex].fInt = ~fRegisters[operand.fIndex].fInt;
1363                     NEXT();
1364                 }
1365                 LABEL(kPrint) {
                         // Debug aid: prints the register's float lanes as "[a, b, ...]", writing "-"
                         // for lanes that are disabled in the mask. Prints nothing when no lane is
                         // enabled. The mask is sampled once, as the other masked handlers do,
                         // instead of being recomputed for every lane.
1366                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1367                     VectorI m = mask();
1368                     if (skvx::any(m)) {
1369                         printf("[");
1370                         const char* separator = "";
1371                         for (int i = 0; i < width; ++i) {
1372                             if (m[i]) {
1373                                 printf("%s%f", separator, fRegisters[src.fIndex].fFloat[i]);
1374                             } else {
1375                                 printf("%s-", separator);
1376                             }
1377                             separator = ", ";
1378                         }
1379                         printf("]\n");
1380                     }
1381                     NEXT();
1382                 }
1383                 LABEL(kReadExternal) {
                         // Reads `count` floats per lane from an external value into consecutive
                         // registers. Operand order: destination register, count, external index.
1384                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1385                     uint8_t count = read<uint8_t>(&ip);
1386                     uint8_t index = read<uint8_t>(&ip);
1387                     SkASSERT(count <= 4);
1388                     SkASSERT(fCode->fExternalValues.size() > index);
                         // Scratch buffer for one lane's values; its size is why count must be <= 4.
1389                     float tmp[4];
1390                     VectorI m = mask();
1391                     for (int i = 0; i < width; ++i) {
                             // Only lanes enabled in the mask call into the external value.
1392                         if (m[i]) {
1393                             fCode->fExternalValues[index]->read(baseIndex + i, tmp);
1394                             for (int j = 0; j < count; ++j) {
1395                                 fRegisters[target.fIndex + j].fFloat[i] = tmp[j];
1396                             }
1397                         }
1398                     }
1399                     NEXT();
1400                 }
1401                 LABEL(kRemainderF) {
                         // dst = VecMod(lhs, rhs) for a single register.
1402                     const auto dst = read<ByteCode::Register>(&ip);
1403                     const auto lhs = read<ByteCode::Register>(&ip);
1404                     const auto rhs = read<ByteCode::Register>(&ip);
1405                     fRegisters[dst.fIndex] = VecMod(fRegisters[lhs.fIndex],
1406                                                     fRegisters[rhs.fIndex]);
1407                     NEXT();
1408                 }
1409                 LABEL(kRemainderFN) {
                         // Multi-register VecMod: a leading count byte, then dst/lhs/rhs base
                         // registers; register n of each operand is combined independently.
1410                     const uint8_t slots = read<uint8_t>(&ip);
1411                     const auto dst = read<ByteCode::Register>(&ip);
1412                     const auto lhs = read<ByteCode::Register>(&ip);
1413                     const auto rhs = read<ByteCode::Register>(&ip);
1414                     for (int n = 0; n < slots; ++n) {
1415                         fRegisters[dst.fIndex + n] = VecMod(fRegisters[lhs.fIndex + n],
1416                                                             fRegisters[rhs.fIndex + n]);
1417                     }
1418                     NEXT();
1419                 }
1420                 LABEL(kReturn) {
                         // Return without a value. If no caller frame is on the call stack, the
                         // enclosing function returns true.
1421                     if (context.fCallStack.empty()) {
1422                         return true;
1423                     }
                         // Otherwise resume the caller: restore its function, code pointer and
                         // instruction pointer from the saved frame.
1424                     StackFrame frame = context.fCallStack.top();
1425                     f = frame.fFunction;
1426                     code = f->fCode.data();
1427                     ip = frame.fIP;
                         // Advance the stack pointer by the slot count recorded in the frame.
                         // NOTE(review): presumably undoes the adjustment made at call time -- confirm.
1428                     context.fStack += frame.fStackSlotCount;
1429                     context.fCallStack.pop();
1430                     NEXT();
1431                 }
1432                 LABEL(kReturnValue) {
                         // Return with a value held in consecutive registers starting at returnValue.
1433                     ByteCode::Register returnValue = read<ByteCode::Register>(&ip);
                         // No caller frame: hand a pointer to the result register back through
                         // outResult (when one was supplied) and return true.
1434                     if (context.fCallStack.empty()) {
1435                         if (outResult) {
1436                             *outResult = &fRegisters[returnValue.fIndex];
1437                         }
1438                         return true;
1439                     }
1440                     StackFrame frame = context.fCallStack.top();
1441                     ip = frame.fIP;
1442                     context.fStack += frame.fStackSlotCount;
                         // Copy the result into the location recorded in the frame. This reads
                         // f->fReturnSlotCount before f is replaced by the frame's function below,
                         // so the ordering of these statements matters.
1443                     memcpy(frame.fReturnValue, &fRegisters[returnValue.fIndex],
1444                            sizeof(Vector) * f->fReturnSlotCount);
1445                     f = frame.fFunction;
1446                     code = f->fCode.data();
1447                     context.fCallStack.pop();
1448                     NEXT();
1449                 }
1450                 LABEL(kScalarToMatrix) {
                         // Builds a columns x rows matrix in consecutive registers with the scalar
                         // register copied onto the diagonal and 0 everywhere else.
1451                     const auto dst = read<ByteCode::Register>(&ip);
1452                     const auto scalar = read<ByteCode::Register>(&ip);
1453                     const uint8_t cols = read<uint8_t>(&ip);
1454                     const uint8_t rowCount = read<uint8_t>(&ip);
1455                     int cell = 0;
1456                     for (int col = 0; col < cols; ++col) {
1457                         for (int row = 0; row < rowCount; ++row, ++cell) {
1458                             if (col != row) {
1459                                 fRegisters[dst.fIndex + cell].fFloat = 0;
1460                             } else {
1461                                 fRegisters[dst.fIndex + cell] = fRegisters[scalar.fIndex];
1462                             }
1463                         }
1464                     }
1465                     NEXT();
1466                 }
1468                 LABEL(kSelect) {
                         // Per-lane select: dst = cond ? whenTrue : whenFalse, via skvx::if_then_else.
1469                     const auto dst = read<ByteCode::Register>(&ip);
1470                     const auto cond = read<ByteCode::Register>(&ip);
1471                     const auto whenTrue = read<ByteCode::Register>(&ip);
1472                     const auto whenFalse = read<ByteCode::Register>(&ip);
1473                     fRegisters[dst.fIndex] = skvx::if_then_else(fRegisters[cond.fIndex].fInt,
1474                                                                 fRegisters[whenTrue.fIndex].fFloat,
1475                                                                 fRegisters[whenFalse.fIndex].fFloat);
1476                     NEXT();
1477                 }
1478                 LABEL(kShiftLeft) {
                         // Shifts every integer lane left by an immediate byte count.
1479                     const auto dst = read<ByteCode::Register>(&ip);
1480                     const auto operand = read<ByteCode::Register>(&ip);
1481                     const uint8_t bits = read<uint8_t>(&ip);
1482                     fRegisters[dst.fIndex].fInt = fRegisters[operand.fIndex].fInt << bits;
1483                     NEXT();
1484                 }
1485                 LABEL(kShiftRightS) {
                         // Right-shifts every signed integer lane by an immediate byte count.
                         // The count is decoded as unsigned, like kShiftLeft and kShiftRightU, so an
                         // encoded byte >= 0x80 can no longer turn into a negative shift amount.
1486                     ByteCode::Register target = read<ByteCode::Register>(&ip);
1487                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1488                     uint8_t count = read<uint8_t>(&ip);
1489                     fRegisters[target.fIndex].fInt = fRegisters[src.fIndex].fInt >> count;
1490                     NEXT();
1491                 }
1492                 LABEL(kShiftRightU) {
                         // Right-shifts every unsigned integer lane by an immediate byte count.
1493                     const auto dst = read<ByteCode::Register>(&ip);
1494                     const auto operand = read<ByteCode::Register>(&ip);
1495                     const uint8_t bits = read<uint8_t>(&ip);
1496                     fRegisters[dst.fIndex].fUInt = fRegisters[operand.fIndex].fUInt >> bits;
1497                     NEXT();
1498                 }
1499                 LABEL(kSignedToFloat) {
                         // Converts every signed integer lane to float with skvx::cast<float>.
1500                     const auto dst = read<ByteCode::Register>(&ip);
1501                     const auto operand = read<ByteCode::Register>(&ip);
1502                     fRegisters[dst.fIndex] =
1503                             Vector(skvx::cast<float>(fRegisters[operand.fIndex].fInt));
1504                     NEXT();
1505                 }
1506                 VECTOR_UNARY_FN(kSin, sinf)  // macro defined elsewhere; presumably sinf per lane (confirm)
1507                 LABEL(kSplat) {
                         // Replicates one source register into `copies` consecutive registers.
1508                     const uint8_t copies = read<uint8_t>(&ip);
1509                     const auto dst = read<ByteCode::Register>(&ip);
1510                     const auto from = read<ByteCode::Register>(&ip);
1511                     for (int n = 0; n < copies; ++n) {
1512                         fRegisters[dst.fIndex + n] = fRegisters[from.fIndex];
1513                     }
1514                     NEXT();
1515                 }
1516                 LABEL(kSqrt) {
                         // Per-lane square root via skvx::sqrt.
1517                     const auto dst = read<ByteCode::Register>(&ip);
1518                     const auto operand = read<ByteCode::Register>(&ip);
1519                     fRegisters[dst.fIndex].fFloat = skvx::sqrt(fRegisters[operand.fIndex].fFloat);
1520                     NEXT();
1521                 }
1522                 LABEL(kStore) {
                         // Indirect, per-lane store into fMemory: every lane enabled in the mask
                         // writes to the slot whose index is stored in that lane of the address
                         // register.
1523                     const auto addr = read<ByteCode::Register>(&ip);
1524                     const auto value = read<ByteCode::Register>(&ip);
1525                     VectorI active = mask();
1526                     for (int lane = 0; lane < width; ++lane) {
1527                         if (active[lane]) {
1528                             const auto slot = fRegisters[addr.fIndex].fInt[lane];
1529                             fMemory[slot].fInt[lane] = fRegisters[value.fIndex].fInt[lane];
1530                         }
1531                     }
1532                     NEXT();
1533                 }
1534                 LABEL(kStoreN) {
                         // Multi-slot indirect store into fMemory. The address register is re-read
                         // for every slot, exactly as in the single-statement form.
1535                     const uint8_t slots = read<uint8_t>(&ip);
1536                     const auto addr = read<ByteCode::Register>(&ip);
1537                     const auto value = read<ByteCode::Register>(&ip);
1538                     VectorI active = mask();
1539                     for (int lane = 0; lane < width; ++lane) {
1540                         if (active[lane]) {
1541                             for (int n = 0; n < slots; ++n) {
1542                                 fMemory[fRegisters[addr.fIndex].fInt[lane] + n].fInt[lane] =
1543                                         fRegisters[value.fIndex + n].fInt[lane];
1544                             }
1545                         }
1546                     }
1547                     NEXT();
1548                 }
1549                 LABEL(kStoreDirect) {
                         // Masked store to a constant fMemory address: enabled lanes take the
                         // register's value, disabled lanes keep what the slot already held.
1550                     const auto dst = read<ByteCode::Pointer>(&ip);
1551                     const auto value = read<ByteCode::Register>(&ip);
1552                     Vector& cell = fMemory[dst.fAddress];
1553                     cell = skvx::if_then_else(mask(), fRegisters[value.fIndex].fFloat,
1554                                               cell.fFloat);
1555                     NEXT();
1556                 }
1557                 LABEL(kStoreDirectN) {
                         // Masked store of `count` consecutive registers to `count` consecutive
                         // fMemory slots starting at a constant address. Disabled lanes keep the
                         // slot's previous contents.
1558                     uint8_t count = read<uint8_t>(&ip);
1559                     ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
1560                     ByteCode::Register src = read<ByteCode::Register>(&ip);
                         // The mask does not change inside the loop, so sample it once (as the
                         // indirect store handlers do) rather than once per slot.
1561                     VectorI m = mask();
1562                     for (int i = 0; i < count; ++i) {
1563                         fMemory[target.fAddress + i] = skvx::if_then_else(
1564                                 m, fRegisters[src.fIndex + i].fFloat,
1565                                 fMemory[target.fAddress + i].fFloat);
1566                     }
1567                     NEXT();
1568                 }
1569                 LABEL(kStoreParameter) {
                         // Indirect, per-lane store relative to parameterBase(): every enabled lane
                         // writes to the slot whose index is held in that lane of the address
                         // register.
1570                     const auto addr = read<ByteCode::Register>(&ip);
1571                     const auto value = read<ByteCode::Register>(&ip);
1572                     Vector* params = parameterBase();
1573                     VectorI active = mask();
1574                     for (int lane = 0; lane < width; ++lane) {
1575                         if (active[lane]) {
1576                             const auto slot = fRegisters[addr.fIndex].fInt[lane];
1577                             params[slot].fInt[lane] = fRegisters[value.fIndex].fInt[lane];
1578                         }
1579                     }
1580                     NEXT();
1581                 }
1582                 LABEL(kStoreParameterN) {
                         // Multi-slot indirect store relative to parameterBase(). The address
                         // register is re-read for every slot, exactly as before.
1583                     const uint8_t slots = read<uint8_t>(&ip);
1584                     const auto addr = read<ByteCode::Register>(&ip);
1585                     const auto value = read<ByteCode::Register>(&ip);
1586                     Vector* params = parameterBase();
1587                     VectorI active = mask();
1588                     for (int lane = 0; lane < width; ++lane) {
1589                         if (active[lane]) {
1590                             for (int n = 0; n < slots; ++n) {
1591                                 params[fRegisters[addr.fIndex].fInt[lane] + n].fInt[lane] =
1592                                         fRegisters[value.fIndex + n].fInt[lane];
1593                             }
1594                         }
1595                     }
1596                     NEXT();
1597                 }
1598                 LABEL(kStoreParameterDirect) {
                         // Masked store to a constant slot relative to parameterBase(): enabled
                         // lanes take the register's value, disabled lanes keep the old contents.
1599                     const auto dst = read<ByteCode::Pointer>(&ip);
1600                     const auto value = read<ByteCode::Register>(&ip);
1601                     Vector* params = parameterBase();
1602                     Vector& cell = params[dst.fAddress];
1603                     cell.fFloat = skvx::if_then_else(mask(), fRegisters[value.fIndex].fFloat,
1604                                                      cell.fFloat);
1605                     NEXT();
1606                 }
1607                 LABEL(kStoreParameterDirectN) {
                         // Masked store of `count` consecutive registers to `count` consecutive
                         // slots relative to parameterBase(). Disabled lanes keep the slot's
                         // previous contents.
1608                     uint8_t count = read<uint8_t>(&ip);
1609                     ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
1610                     ByteCode::Register src = read<ByteCode::Register>(&ip);
1611                     Vector* base = parameterBase();
                         // The mask does not change inside the loop, so sample it once (as the
                         // indirect store handlers do) rather than once per slot.
1612                     VectorI m = mask();
1613                     for (int i = 0; i < count; ++i) {
1614                         base[target.fAddress + i].fFloat = skvx::if_then_else(
1615                                 m, fRegisters[src.fIndex + i].fFloat,
1616                                 base[target.fAddress + i].fFloat);
1617                     }
1618                     NEXT();
1619                 }
1620                 LABEL(kStoreStack) {
                         // Indirect, per-lane stack store: every enabled lane writes to the stack
                         // slot whose index is held in that lane of the address register.
1621                     const auto addr = read<ByteCode::Register>(&ip);
1622                     const auto value = read<ByteCode::Register>(&ip);
1623                     VectorI active = mask();
1624                     for (int lane = 0; lane < width; ++lane) {
1625                         if (active[lane]) {
1626                             const auto slot = fRegisters[addr.fIndex].fInt[lane];
1627                             context.fStack[slot].fInt[lane] = fRegisters[value.fIndex].fInt[lane];
1628                         }
1629                     }
1630                     NEXT();
1631                 }
1632                 LABEL(kStoreStackN) {
                         // Multi-slot indirect stack store. The address register is re-read for
                         // every slot, exactly as before.
1633                     const uint8_t slots = read<uint8_t>(&ip);
1634                     const auto addr = read<ByteCode::Register>(&ip);
1635                     const auto value = read<ByteCode::Register>(&ip);
1636                     VectorI active = mask();
1637                     for (int lane = 0; lane < width; ++lane) {
1638                         if (active[lane]) {
1639                             for (int n = 0; n < slots; ++n) {
1640                                 context.fStack[fRegisters[addr.fIndex].fInt[lane] + n].fInt[lane] =
1641                                         fRegisters[value.fIndex + n].fInt[lane];
1642                             }
1643                         }
1644                     }
1645                     NEXT();
1646                 }
1647                 LABEL(kStoreStackDirect) {
                         // Masked store to a constant offset from context.fStack: enabled lanes take
                         // the register's value, disabled lanes keep the slot's old contents.
1648                     const auto dst = read<ByteCode::Pointer>(&ip);
1649                     CHECK_STACK_BOUNDS(dst.fAddress);
1650                     const auto value = read<ByteCode::Register>(&ip);
1651                     auto& cell = context.fStack[dst.fAddress];
1652                     cell = skvx::if_then_else(mask(),
1653                                               fRegisters[value.fIndex].fFloat,
1654                                               cell.fFloat);
1655                     NEXT();
1656                 }
1657                 LABEL(kStoreStackDirectN) {
                         // Masked store of `count` consecutive registers to `count` consecutive stack
                         // slots starting at a constant offset from context.fStack. Disabled lanes
                         // keep the slot's previous contents.
1658                     uint8_t count = read<uint8_t>(&ip);
1659                     ByteCode::Pointer target = read<ByteCode::Pointer>(&ip);
1660                     ByteCode::Register src = read<ByteCode::Register>(&ip);
                         // The mask is loop-invariant, so it is sampled once.
1661                     VectorI m = mask();
1662                     for (int i = 0; i < count; ++i) {
                             // Bounds-check every slot that is written, not just the first one.
1663                         const auto slot = target.fAddress + i;
1664                         CHECK_STACK_BOUNDS(slot);
1665                         context.fStack[slot] = skvx::if_then_else(
1666                                 m, fRegisters[src.fIndex + i].fFloat, context.fStack[slot].fFloat);
1667                     }
1668                     NEXT();
1669                 }
1670                 VECTOR_UNARY_FN(kTan, tanf)  // macro defined elsewhere; presumably tanf per lane (confirm)
1671                 LABEL(kUnsignedToFloat) {
                         // Converts every unsigned integer lane to float with skvx::cast<float>.
1672                     const auto dst = read<ByteCode::Register>(&ip);
1673                     const auto operand = read<ByteCode::Register>(&ip);
1674                     fRegisters[dst.fIndex] =
1675                             Vector(skvx::cast<float>(fRegisters[operand.fIndex].fUInt));
1676                     NEXT();
1677                 }
1678                 LABEL(kWriteExternal) {
                         // Writes `count` floats per lane from consecutive registers to an external
                         // value. Operand order is external index, count, source register -- note
                         // that this differs from the order decoded by kReadExternal.
1679                     uint8_t index = read<uint8_t>(&ip);
1680                     uint8_t count = read<uint8_t>(&ip);
1681                     SkASSERT(count <= 4);
1682                     SkASSERT(fCode->fExternalValues.size() > index);
1683                     ByteCode::Register src = read<ByteCode::Register>(&ip);
                         // Scratch buffer for one lane's values; its size is why count must be <= 4.
1684                     float tmp[4];
1685                     VectorI m = mask();
1686                     for (int i = 0; i < width; ++i) {
                             // Only lanes enabled in the mask are written out.
1687                         if (m[i]) {
                                 // Gather lane i of each source register, then hand the values over.
1688                             for (int j = 0; j < count; ++j) {
1689                                 tmp[j] = fRegisters[src.fIndex + j].fFloat[i];
1690                             }
1691                             fCode->fExternalValues[index]->write(baseIndex + i, tmp);
1692                         }
1693                     }
1694                     NEXT();
1695                 }
1696 #ifndef SKSL_THREADED_CODE
1697             }
1698         }
1699 #endif
1700     }
1701 
1702     const std::unique_ptr<ByteCode> fCode;  // owned bytecode; handlers read fExternalValues from it
1703
         // Not referenced by the opcode handlers in this part of the file.
         // NOTE(review): presumably the allocation backing fRegisters/fMemory -- confirm.
1704     void* fBackingStore;
1705
         // Register file: handlers index it with ByteCode::Register::fIndex.
1706     Vector* fRegisters;
1707
         // Slots written by the kStore* handlers, addressed either by a constant
         // ByteCode::Pointer::fAddress or by a per-lane index held in a register.
1708     Vector* fMemory;
1709
1710     friend class ByteCode;
1711
1712     friend class ByteCodeGenerator;
1713 };
1714 
1715 #undef BINARY_OP
1716 #undef CHECK_STACK_BOUNDS
1717 
1718 } // namespace SkSL
1719 
1720 #endif
1721