1 /* Author(s):
2  *   Connor Abbott
3  *   Alyssa Rosenzweig
4  *
5  * Copyright (c) 2013 Connor Abbott (connor@abbott.cx)
6  * Copyright (c) 2018 Alyssa Rosenzweig (alyssa@rosenzweig.io)
7  * Copyright (C) 2019-2020 Collabora, Ltd.
8  *
9  * Permission is hereby granted, free of charge, to any person obtaining a copy
10  * of this software and associated documentation files (the "Software"), to deal
11  * in the Software without restriction, including without limitation the rights
12  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13  * copies of the Software, and to permit persons to whom the Software is
14  * furnished to do so, subject to the following conditions:
15  *
16  * The above copyright notice and this permission notice shall be included in
17  * all copies or substantial portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25  * THE SOFTWARE.
26  */
27 
28 #ifndef __midgard_h__
29 #define __midgard_h__
30 
31 #include <stdint.h>
32 #include <stdbool.h>
33 
/* Debug flag bits, OR'd together in midgard_debug below (presumably set from
 * an environment debug option — confirm against the compiler driver). */
#define MIDGARD_DBG_MSGS		0x0001
#define MIDGARD_DBG_SHADERS		0x0002
#define MIDGARD_DBG_SHADERDB            0x0004
#define MIDGARD_DBG_INORDER             0x0008
#define MIDGARD_DBG_VERBOSE             0x0010
#define MIDGARD_DBG_INTERNAL            0x0020

/* Active debug flags; defined in a .c file elsewhere. */
extern int midgard_debug;
42 
/* Broad classes of instruction words in the ISA. */
typedef enum {
        midgard_word_type_alu,
        midgard_word_type_load_store,
        midgard_word_type_texture
} midgard_word_type;
48 
/* The ALU execution units/slots available within an ALU bundle. */
typedef enum {
        midgard_alu_vmul,
        midgard_alu_sadd,
        midgard_alu_smul,
        midgard_alu_vadd,
        midgard_alu_lut
} midgard_alu;
56 
/* 4-bit instruction-word tags. The ALU tags encode the bundle size in
 * quadwords (4/8/12/16 bytes), with _WRITEOUT variants for bundles that
 * write out to the tilebuffer. */
enum {
        TAG_INVALID = 0x0,
        TAG_BREAK = 0x1,
        TAG_TEXTURE_4_VTX = 0x2,
        TAG_TEXTURE_4 = 0x3,
        TAG_TEXTURE_4_BARRIER = 0x4,
        TAG_LOAD_STORE_4 = 0x5,
        TAG_UNKNOWN_1 = 0x6,
        TAG_UNKNOWN_2 = 0x7,
        TAG_ALU_4 = 0x8,
        TAG_ALU_8 = 0x9,
        TAG_ALU_12 = 0xA,
        TAG_ALU_16 = 0xB,
        TAG_ALU_4_WRITEOUT = 0xC,
        TAG_ALU_8_WRITEOUT = 0xD,
        TAG_ALU_12_WRITEOUT = 0xE,
        TAG_ALU_16_WRITEOUT = 0xF
};
75 
76 /*
77  * ALU words
78  */
79 
/* ALU opcode encodings, shared by the vector and scalar ALU word formats.
 * Gaps in the numbering are encodings not listed here. The values are
 * hardware encodings and must not be changed. */
typedef enum {
        midgard_alu_op_fadd        = 0x10, /* round to even */
        midgard_alu_op_fadd_rtz    = 0x11,
        midgard_alu_op_fadd_rtn    = 0x12,
        midgard_alu_op_fadd_rtp    = 0x13,
        midgard_alu_op_fmul        = 0x14, /* round to even */
        midgard_alu_op_fmul_rtz    = 0x15,
        midgard_alu_op_fmul_rtn    = 0x16,
        midgard_alu_op_fmul_rtp    = 0x17,

        midgard_alu_op_fmin        = 0x28, /* if an operand is NaN, propagate the other */
        midgard_alu_op_fmin_nan    = 0x29, /* if an operand is NaN, propagate it */
        midgard_alu_op_fabsmin     = 0x2A, /* min(abs(a), abs(b)) */
        midgard_alu_op_fabsmin_nan = 0x2B, /* min_nan(abs(a), abs(b)) */
        midgard_alu_op_fmax        = 0x2C, /* if an operand is NaN, propagate the other */
        midgard_alu_op_fmax_nan    = 0x2D, /* if an operand is NaN, propagate it */
        midgard_alu_op_fabsmax     = 0x2E, /* max(abs(a), abs(b)) */
        midgard_alu_op_fabsmax_nan = 0x2F, /* max_nan(abs(a), abs(b)) */

        midgard_alu_op_fmov        = 0x30, /* fmov_rte */
        midgard_alu_op_fmov_rtz    = 0x31,
        midgard_alu_op_fmov_rtn    = 0x32,
        midgard_alu_op_fmov_rtp    = 0x33,
        midgard_alu_op_froundeven  = 0x34,
        midgard_alu_op_ftrunc      = 0x35,
        midgard_alu_op_ffloor      = 0x36,
        midgard_alu_op_fceil       = 0x37,
        midgard_alu_op_ffma        = 0x38, /* rte */
        midgard_alu_op_ffma_rtz    = 0x39,
        midgard_alu_op_ffma_rtn    = 0x3A,
        midgard_alu_op_ffma_rtp    = 0x3B,
        midgard_alu_op_fdot3       = 0x3C,
        midgard_alu_op_fdot3r      = 0x3D,
        midgard_alu_op_fdot4       = 0x3E,
        midgard_alu_op_freduce     = 0x3F,

        midgard_alu_op_iadd        = 0x40,
        midgard_alu_op_ishladd     = 0x41, /* (a<<1) + b */
        midgard_alu_op_isub        = 0x46,
        midgard_alu_op_ishlsub     = 0x47, /* (a<<1) - b */
        midgard_alu_op_iaddsat     = 0x48,
        midgard_alu_op_uaddsat     = 0x49,
        midgard_alu_op_isubsat     = 0x4E,
        midgard_alu_op_usubsat     = 0x4F,

        midgard_alu_op_imul        = 0x58,
        /* Multiplies two ints and stores the result in the next larger datasize. */
        midgard_alu_op_iwmul       = 0x59, /* sint * sint = sint */
        midgard_alu_op_uwmul       = 0x5A, /* uint * uint = uint */
        midgard_alu_op_iuwmul      = 0x5B, /* sint * uint = sint */

        midgard_alu_op_imin        = 0x60,
        midgard_alu_op_umin        = 0x61,
        midgard_alu_op_imax        = 0x62,
        midgard_alu_op_umax        = 0x63,
        midgard_alu_op_iavg        = 0x64,
        midgard_alu_op_uavg        = 0x65,
        midgard_alu_op_iravg       = 0x66,
        midgard_alu_op_uravg       = 0x67,
        midgard_alu_op_iasr        = 0x68,
        midgard_alu_op_ilsr        = 0x69,
        midgard_alu_op_ishlsat     = 0x6C,
        midgard_alu_op_ushlsat     = 0x6D,
        midgard_alu_op_ishl        = 0x6E,

        midgard_alu_op_iand        = 0x70,
        midgard_alu_op_ior         = 0x71,
        midgard_alu_op_inand       = 0x72, /* ~(a & b), for inot let a = b */
        midgard_alu_op_inor        = 0x73, /* ~(a | b) */
        midgard_alu_op_iandnot     = 0x74, /* (a & ~b), used for not/b2f */
        midgard_alu_op_iornot      = 0x75, /* (a | ~b) */
        midgard_alu_op_ixor        = 0x76,
        midgard_alu_op_inxor       = 0x77, /* ~(a ^ b) */
        midgard_alu_op_iclz        = 0x78, /* Number of zeroes on left */
        midgard_alu_op_ipopcnt     = 0x7A, /* Population count */
        midgard_alu_op_imov        = 0x7B,
        midgard_alu_op_iabsdiff    = 0x7C,
        midgard_alu_op_uabsdiff    = 0x7D,
        midgard_alu_op_ichoose     = 0x7E, /* vector, component number - dupe for shuffle() */

        midgard_alu_op_feq         = 0x80,
        midgard_alu_op_fne         = 0x81,
        midgard_alu_op_flt         = 0x82,
        midgard_alu_op_fle         = 0x83,
        midgard_alu_op_fball_eq    = 0x88,
        midgard_alu_op_fball_neq   = 0x89,
        midgard_alu_op_fball_lt    = 0x8A, /* all(lessThan(.., ..)) */
        midgard_alu_op_fball_lte   = 0x8B, /* all(lessThanEqual(.., ..)) */

        midgard_alu_op_fbany_eq    = 0x90,
        midgard_alu_op_fbany_neq   = 0x91,
        midgard_alu_op_fbany_lt    = 0x92, /* any(lessThan(.., ..)) */
        midgard_alu_op_fbany_lte   = 0x93, /* any(lessThanEqual(.., ..)) */

        midgard_alu_op_f2i_rte     = 0x98,
        midgard_alu_op_f2i_rtz     = 0x99,
        midgard_alu_op_f2i_rtn     = 0x9A,
        midgard_alu_op_f2i_rtp     = 0x9B,
        midgard_alu_op_f2u_rte     = 0x9C,
        midgard_alu_op_f2u_rtz     = 0x9D,
        midgard_alu_op_f2u_rtn     = 0x9E,
        midgard_alu_op_f2u_rtp     = 0x9F,

        midgard_alu_op_ieq         = 0xA0,
        midgard_alu_op_ine         = 0xA1,
        midgard_alu_op_ult         = 0xA2,
        midgard_alu_op_ule         = 0xA3,
        midgard_alu_op_ilt         = 0xA4,
        midgard_alu_op_ile         = 0xA5,
        midgard_alu_op_iball_eq    = 0xA8,
        midgard_alu_op_iball_neq   = 0xA9,
        midgard_alu_op_uball_lt    = 0xAA,
        midgard_alu_op_uball_lte   = 0xAB,
        midgard_alu_op_iball_lt    = 0xAC,
        midgard_alu_op_iball_lte   = 0xAD,

        midgard_alu_op_ibany_eq    = 0xB0,
        midgard_alu_op_ibany_neq   = 0xB1,
        midgard_alu_op_ubany_lt    = 0xB2,
        midgard_alu_op_ubany_lte   = 0xB3,
        midgard_alu_op_ibany_lt    = 0xB4, /* any(lessThan(.., ..)) */
        midgard_alu_op_ibany_lte   = 0xB5, /* any(lessThanEqual(.., ..)) */
        midgard_alu_op_i2f_rte     = 0xB8,
        midgard_alu_op_i2f_rtz     = 0xB9,
        midgard_alu_op_i2f_rtn     = 0xBA,
        midgard_alu_op_i2f_rtp     = 0xBB,
        midgard_alu_op_u2f_rte     = 0xBC,
        midgard_alu_op_u2f_rtz     = 0xBD,
        midgard_alu_op_u2f_rtn     = 0xBE,
        midgard_alu_op_u2f_rtp     = 0xBF,

        /* All csel* instructions use as a condition the output of the previous
         * vector or scalar unit, thus it must run on the second pipeline stage
         * and be scheduled to the same bundle as the opcode that it uses as a
         * condition. */
        midgard_alu_op_icsel_v     = 0xC0,
        midgard_alu_op_icsel       = 0xC1,
        midgard_alu_op_fcsel_v     = 0xC4,
        midgard_alu_op_fcsel       = 0xC5,
        midgard_alu_op_froundaway  = 0xC6, /* round to nearest away */

        midgard_alu_op_fatan2_pt2  = 0xE8,
        midgard_alu_op_fpow_pt1    = 0xEC,
        midgard_alu_op_fpown_pt1   = 0xED,
        midgard_alu_op_fpowr_pt1   = 0xEE,

        midgard_alu_op_frcp        = 0xF0,
        midgard_alu_op_frsqrt      = 0xF2,
        midgard_alu_op_fsqrt       = 0xF3,
        midgard_alu_op_fexp2       = 0xF4,
        midgard_alu_op_flog2       = 0xF5,
        midgard_alu_op_fsinpi      = 0xF6, /* sin(pi * x) */
        midgard_alu_op_fcospi      = 0xF7, /* cos(pi * x) */
        midgard_alu_op_fatan2_pt1  = 0xF9,
} midgard_alu_op;
235 
/* Output modifiers applied to the result of a float ALU op before it is
 * written to the destination. */
typedef enum {
        midgard_outmod_none        = 0,
        midgard_outmod_clamp_0_inf = 1, /* max(x, 0.0), NaNs become +0.0 */
        midgard_outmod_clamp_m1_1  = 2, /* clamp(x, -1.0, 1.0), NaNs become -1.0 */
        midgard_outmod_clamp_0_1   = 3  /* clamp(x, 0.0, 1.0), NaNs become +0.0 */
} midgard_outmod_float;
242 
/* These are applied to the resulting value that's going to be stored in the dest reg.
 * This should be set to midgard_outmod_keeplo when shrink_mode is midgard_shrink_mode_none. */
typedef enum {
        midgard_outmod_ssat   = 0, /* signed saturate */
        midgard_outmod_usat   = 1, /* unsigned saturate */
        midgard_outmod_keeplo = 2, /* Keep low half */
        midgard_outmod_keephi = 3, /* Keep high half */
} midgard_outmod_int;
251 
/* Element size (in bits) that an ALU op operates on within the register. */
typedef enum {
        midgard_reg_mode_8  = 0,
        midgard_reg_mode_16 = 1,
        midgard_reg_mode_32 = 2,
        midgard_reg_mode_64 = 3
} midgard_reg_mode;
258 
/* Selects which half of the destination a narrowed result lands in, or no
 * narrowing at all. */
typedef enum {
        midgard_shrink_mode_lower = 0,
        midgard_shrink_mode_upper = 1,
        midgard_shrink_mode_none  = 2
} midgard_shrink_mode;
264 
/* Integer source modifiers.
 * Only used if midgard_src_expand_mode is set to one of midgard_src_expand_*. */
typedef enum {
        midgard_int_sign_extend = 0,
        midgard_int_zero_extend = 1,
        midgard_int_replicate   = 2,
        midgard_int_left_shift  = 3
} midgard_int_mod;
272 
/* Float source modifiers. Unlike midgard_int_mod, float modifiers are applied
 * after the expansion happens, so they don't depend on midgard_src_expand_mode. */
#define MIDGARD_FLOAT_MOD_ABS (1 << 0)
#define MIDGARD_FLOAT_MOD_NEG (1 << 1)
277 
/* The expand options depend on both midgard_int_mod and midgard_reg_mode.  For
 * example, a vec4 with midgard_int_sign_extend and midgard_src_expand_low is
 * treated as a vec8 and each 16-bit element from the low 64-bits is then sign
 * extended, resulting in a vec4 where each 32-bit element corresponds to a
 * 16-bit element from the low 64-bits of the input vector. */
typedef enum {
        midgard_src_passthrough = 0,
        midgard_src_rep_low = 1, /* replicate lower 64 bits to higher 64 bits */
        midgard_src_rep_high = 2, /* replicate higher 64 bits to lower 64 bits */
        midgard_src_swap = 3, /* swap lower 64 bits with higher 64 bits */
        midgard_src_expand_low = 4, /* expand low 64 bits */
        midgard_src_expand_high = 5, /* expand high 64 bits */
        midgard_src_expand_low_swap = 6, /* expand low 64 bits, then swap */
        midgard_src_expand_high_swap = 7, /* expand high 64 bits, then swap */
} midgard_src_expand_mode;
293 
/* True iff the expand mode performs an expansion (midgard_src_expand_*).
 * Argument parenthesized so expressions like INPUT_EXPANDS(x | y) associate
 * correctly. */
#define INPUT_EXPANDS(a) \
        ((a) >= midgard_src_expand_low && (a) <= midgard_src_expand_high_swap)
296 
/* True iff the expand mode swaps the 64-bit halves (midgard_src_swap or one
 * of the *_swap expand modes). Argument parenthesized to avoid precedence
 * surprises when the caller passes a compound expression. */
#define INPUT_SWAPS(a) \
        ((a) == midgard_src_swap || (a) >= midgard_src_expand_low_swap)
299 
/* Source descriptor for a vector ALU op; 13 bits, matching the src1/src2
 * fields of midgard_vector_alu. */
typedef struct
__attribute__((__packed__))
{
        /* Either midgard_int_mod or from midgard_float_mod_*, depending on the
         * type of op */
        unsigned mod : 2;
        midgard_src_expand_mode expand_mode : 3;
        /* 2 bits per component */
        unsigned swizzle : 8;
}
midgard_vector_alu_src;
310 
/* A packed vector ALU instruction word (48 bits). src1/src2 hold packed
 * midgard_vector_alu_src descriptors. */
typedef struct
__attribute__((__packed__))
{
        midgard_alu_op op               :  8;
        midgard_reg_mode reg_mode       :  2;
        unsigned src1                   : 13;
        unsigned src2                   : 13;
        midgard_shrink_mode shrink_mode :  2;
        /* midgard_outmod_float or midgard_outmod_int, depending on op */
        unsigned outmod                 :  2;
        unsigned mask                   :  8;
}
midgard_vector_alu;
323 
/* Source descriptor for a scalar ALU op; 6 bits, matching the src1 field of
 * midgard_scalar_alu. */
typedef struct
__attribute__((__packed__))
{
        /* Either midgard_int_mod or midgard_float_mod_*, as for vector srcs */
        unsigned mod       : 2;
        bool full          : 1; /* 0 = 16-bit, 1 = 32-bit */
        unsigned component : 3;
}
midgard_scalar_alu_src;
332 
/* A packed scalar ALU instruction word (32 bits). */
typedef struct
__attribute__((__packed__))
{
        midgard_alu_op op         :  8;
        unsigned src1             :  6; /* packed midgard_scalar_alu_src */
        /* last 5 bits are used when src2 is an immediate */
        unsigned src2             : 11;
        unsigned reserved         :  1;
        unsigned outmod           :  2;
        bool output_full          :  1; /* 0 = 16-bit, 1 = 32-bit result */
        unsigned output_component :  3;
}
midgard_scalar_alu;
346 
/* Register selection word: source/destination register indices for an ALU
 * instruction, plus a flag marking src2 as an inline immediate. */
typedef struct
__attribute__((__packed__))
{
        unsigned src1_reg : 5;
        unsigned src2_reg : 5;
        unsigned out_reg  : 5;
        bool src2_imm     : 1;
}
midgard_reg_info;
356 
357 /* In addition to conditional branches and jumps (unconditional branches),
358  * Midgard implements a bit of fixed function functionality used in fragment
359  * shaders via specially crafted branches. These have special branch opcodes,
360  * which perform a fixed-function operation and/or use the results of a
361  * fixed-function operation as the branch condition.  */
362 
/* Branch opcodes, including the fixed-function writeout/discard variants
 * described above. */
typedef enum {
        /* Regular branches */
        midgard_jmp_writeout_op_branch_uncond = 1,
        midgard_jmp_writeout_op_branch_cond = 2,

        /* In a fragment shader, execute a discard_if instruction, with the
         * corresponding condition code. Terminates the shader, so generally
         * set the branch target to out of the shader */
        midgard_jmp_writeout_op_discard = 4,

        /* Branch if the tilebuffer is not yet ready. At the beginning of a
         * fragment shader that reads from the tile buffer, for instance via
         * ARM_shader_framebuffer_fetch or EXT_pixel_local_storage, this branch
         * operation should be used as a loop. An instruction like
         * "br.tilebuffer.always -1" does the trick, corresponding to
         * "while (!is_tilebuffer_ready);" */
        midgard_jmp_writeout_op_tilebuffer_pending = 6,

        /* In a fragment shader, try to write out the value pushed to r0 to the
         * tilebuffer, subject to state in r1.z and r1.w. If this
         * succeeds, the shader terminates. If it fails, it branches to the
         * specified branch target. Generally, this should be used in a loop to
         * itself, acting as "do { write(r0); } while(!write_successful);" */
        midgard_jmp_writeout_op_writeout = 7,
} midgard_jmp_writeout_op;
388 
/* 2-bit condition codes for conditional branches. */
typedef enum {
        midgard_condition_write0 = 0,

        /* These condition codes denote a conditional branch on FALSE and on
         * TRUE respectively */
        midgard_condition_false = 1,
        midgard_condition_true = 2,

        /* This condition code always branches. For a pure branch, the
         * unconditional branch coding should be used instead, but for
         * fixed-function branch opcodes, this is still useful */
        midgard_condition_always = 3,
} midgard_condition;
402 
/* Call/return behaviour attached to unconditional and extended branches.
 * Value 0 is unused here — presumably reserved; confirm against the disassembler. */
enum midgard_call_mode {
        midgard_call_mode_default = 1,
        midgard_call_mode_call = 2,
        midgard_call_mode_return = 3
};
408 
/* Packed unconditional branch (16 bits). */
typedef struct
__attribute__((__packed__))
{
        midgard_jmp_writeout_op op : 3; /* == branch_uncond */
        unsigned dest_tag : 4; /* tag of branch destination */
        enum midgard_call_mode call_mode : 2;
        int offset : 7; /* signed, in instruction words */
}
midgard_branch_uncond;
418 
/* Packed conditional branch (16 bits). */
typedef struct
__attribute__((__packed__))
{
        midgard_jmp_writeout_op op : 3; /* == branch_cond */
        unsigned dest_tag : 4; /* tag of branch destination */
        int offset : 7; /* signed, in instruction words */
        midgard_condition cond : 2;
}
midgard_branch_cond;
428 
/* Packed extended branch (48 bits), with a wider offset and a LUT-encoded
 * multi-condition. */
typedef struct
__attribute__((__packed__))
{
        midgard_jmp_writeout_op op : 3; /* == branch_cond */
        unsigned dest_tag : 4; /* tag of branch destination */
        enum midgard_call_mode call_mode : 2;
        signed offset : 23;

        /* Extended branches permit inputting up to 4 conditions loaded into
         * r31 (two in r31.w and two in r31.x). In the most general case, we
         * specify a function f(A, B, C, D) mapping 4 1-bit conditions to a
         * single 1-bit branch criteria. Note that the domain of f has 2^(2^4)
         * elements, each mapping to 1-bit of output, so we can trivially
         * construct a Godel numbering of f as a (2^4)=16-bit integer. This
         * 16-bit integer serves as a lookup table to compute f, subject to
         * some swaps for ordering.
         *
         * Interestingly, the standard 2-bit condition codes are also a LUT with
         * the same format (2^1-bit), but it's usually easier to use enums. */

        unsigned cond : 16;
}
midgard_branch_extended;
452 
/* Packed writeout branch (16 bits). */
typedef struct
__attribute__((__packed__))
{
        midgard_jmp_writeout_op op : 3; /* == writeout */
        unsigned unknown : 13;
}
midgard_writeout;
460 
461 /*
462  * Load/store words
463  */
464 
/* Load/store opcode encodings. These are hardware encodings and must not be
 * changed; gaps in the numbering are encodings not listed here. */
typedef enum {
        midgard_op_ld_st_noop   = 0x03,

        /* Unpacks a colour from a native format to <format> */
        midgard_op_unpack_colour_f32 = 0x04,
        midgard_op_unpack_colour_f16 = 0x05,
        midgard_op_unpack_colour_u32 = 0x06,
        midgard_op_unpack_colour_s32 = 0x07,

        /* Packs a colour from <format> to a native format */
        midgard_op_pack_colour_f32 = 0x08,
        midgard_op_pack_colour_f16 = 0x09,
        midgard_op_pack_colour_u32 = 0x0A,
        midgard_op_pack_colour_s32 = 0x0B,

        /* Computes the effective address of a mem address expression */
        midgard_op_lea = 0x0C,

        /* Converts image coordinates into mem address */
        midgard_op_lea_image = 0x0D,

        /* Unclear why this is on the L/S unit, but moves fp32 cube map
         * coordinates in r27 to its cube map texture coordinate destination
         * (e.g r29). */

        midgard_op_ld_cubemap_coords = 0x0E,

        /* A mov between registers that the ldst pipeline can access */
        midgard_op_ldst_mov = 0x10,

        /* The L/S unit can do perspective division a clock faster than the ALU
         * if you're lucky. Put the vec4 in r27, and call with 0x24 as the
         * unknown state; the output will be <x/w, y/w, z/w, 1>. Replace w with
         * z for the z version */
        midgard_op_ldst_perspective_div_y = 0x11,
        midgard_op_ldst_perspective_div_z = 0x12,
        midgard_op_ldst_perspective_div_w = 0x13,

        /* val in r27.y, address embedded, outputs result to argument. Invert val for sub. Let val = +-1 for inc/dec. */
        midgard_op_atomic_add = 0x40,
        midgard_op_atomic_add64 = 0x41,
        midgard_op_atomic_add_be = 0x42,
        midgard_op_atomic_add64_be = 0x43,

        midgard_op_atomic_and = 0x44,
        midgard_op_atomic_and64 = 0x45,
        midgard_op_atomic_and_be = 0x46,
        midgard_op_atomic_and64_be = 0x47,
        midgard_op_atomic_or = 0x48,
        midgard_op_atomic_or64 = 0x49,
        midgard_op_atomic_or_be = 0x4A,
        midgard_op_atomic_or64_be = 0x4B,
        midgard_op_atomic_xor = 0x4C,
        midgard_op_atomic_xor64 = 0x4D,
        midgard_op_atomic_xor_be = 0x4E,
        midgard_op_atomic_xor64_be = 0x4F,

        midgard_op_atomic_imin = 0x50,
        midgard_op_atomic_imin64 = 0x51,
        midgard_op_atomic_imin_be = 0x52,
        midgard_op_atomic_imin64_be = 0x53,
        midgard_op_atomic_umin = 0x54,
        midgard_op_atomic_umin64 = 0x55,
        midgard_op_atomic_umin_be = 0x56,
        midgard_op_atomic_umin64_be = 0x57,
        midgard_op_atomic_imax = 0x58,
        midgard_op_atomic_imax64 = 0x59,
        midgard_op_atomic_imax_be = 0x5A,
        midgard_op_atomic_imax64_be = 0x5B,
        midgard_op_atomic_umax = 0x5C,
        midgard_op_atomic_umax64 = 0x5D,
        midgard_op_atomic_umax_be = 0x5E,
        midgard_op_atomic_umax64_be = 0x5F,

        midgard_op_atomic_xchg = 0x60,
        midgard_op_atomic_xchg64 = 0x61,
        midgard_op_atomic_xchg_be = 0x62,
        midgard_op_atomic_xchg64_be = 0x63,

        midgard_op_atomic_cmpxchg = 0x64,
        midgard_op_atomic_cmpxchg64 = 0x65,
        midgard_op_atomic_cmpxchg_be = 0x66,
        midgard_op_atomic_cmpxchg64_be = 0x67,

        /* Used for compute shader's __global arguments, __local
         * variables (or for register spilling) */

        midgard_op_ld_u8         = 0x80, /* zero extends */
        midgard_op_ld_i8         = 0x81, /* sign extends */
        midgard_op_ld_u16        = 0x84, /* zero extends */
        midgard_op_ld_i16        = 0x85, /* sign extends */
        midgard_op_ld_u16_be     = 0x86, /* zero extends, big endian */
        midgard_op_ld_i16_be     = 0x87, /* sign extends, big endian */
        midgard_op_ld_32         = 0x88, /* short2, int, float */
        midgard_op_ld_32_bswap2  = 0x89, /* 16-bit big endian vector */
        midgard_op_ld_32_bswap4  = 0x8A, /* 32-bit big endian scalar */
        midgard_op_ld_64         = 0x8C, /* int2, float2, long */
        midgard_op_ld_64_bswap2  = 0x8D, /* 16-bit big endian vector */
        midgard_op_ld_64_bswap4  = 0x8E, /* 32-bit big endian vector */
        midgard_op_ld_64_bswap8  = 0x8F, /* 64-bit big endian scalar */
        midgard_op_ld_128        = 0x90, /* float4, long2 */
        midgard_op_ld_128_bswap2 = 0x91, /* 16-bit big endian vector */
        midgard_op_ld_128_bswap4 = 0x92, /* 32-bit big endian vector */
        midgard_op_ld_128_bswap8 = 0x93, /* 64-bit big endian vector */

        midgard_op_ld_attr_32 = 0x94,
        midgard_op_ld_attr_16 = 0x95,
        midgard_op_ld_attr_32u = 0x96,
        midgard_op_ld_attr_32i = 0x97,
        midgard_op_ld_vary_32 = 0x98,
        midgard_op_ld_vary_16 = 0x99,
        midgard_op_ld_vary_32u = 0x9A,
        midgard_op_ld_vary_32i = 0x9B,

        /* This instruction behaves differently depending if the gpu is a v4
         * or a newer gpu. The main difference hinges on which values of the
         * second argument are valid for each gpu.
         * TODO: properly document and decode each possible value for the
         * second argument. */
        midgard_op_ld_special_32f = 0x9C,
        midgard_op_ld_special_16f = 0x9D,
        midgard_op_ld_special_32u = 0x9E,
        midgard_op_ld_special_32i = 0x9F,

        /* The distinction between these ops is the alignment
         * requirement / accompanying shift. Thus, the offset to
         * ld_ubo_128 is in 16-byte units and can load 128-bit. The
         * offset to ld_ubo_64 is in 8-byte units; ld_ubo_32 in 4-byte
         * units. */
        midgard_op_ld_ubo_u8         = 0xA0, /* theoretical */
        midgard_op_ld_ubo_i8         = 0xA1, /* theoretical */
        midgard_op_ld_ubo_u16        = 0xA4, /* theoretical */
        midgard_op_ld_ubo_i16        = 0xA5, /* theoretical */
        midgard_op_ld_ubo_u16_be     = 0xA6, /* theoretical */
        midgard_op_ld_ubo_i16_be     = 0xA7, /* theoretical */
        midgard_op_ld_ubo_32         = 0xA8,
        midgard_op_ld_ubo_32_bswap2  = 0xA9,
        midgard_op_ld_ubo_32_bswap4  = 0xAA,
        midgard_op_ld_ubo_64         = 0xAC,
        midgard_op_ld_ubo_64_bswap2  = 0xAD,
        midgard_op_ld_ubo_64_bswap4  = 0xAE,
        midgard_op_ld_ubo_64_bswap8  = 0xAF,
        midgard_op_ld_ubo_128        = 0xB0,
        midgard_op_ld_ubo_128_bswap2 = 0xB1,
        midgard_op_ld_ubo_128_bswap4 = 0xB2,
        midgard_op_ld_ubo_128_bswap8 = 0xB3,

        midgard_op_ld_image_32f = 0xB4,
        midgard_op_ld_image_16f = 0xB5,
        midgard_op_ld_image_32u = 0xB6,
        midgard_op_ld_image_32i = 0xB7,

        /* Only works on v5 or newer.
         * Older cards must use ld_special with tilebuffer selectors. */
        midgard_op_ld_tilebuffer_32f = 0xB8,
        midgard_op_ld_tilebuffer_16f = 0xB9,
        midgard_op_ld_tilebuffer_raw = 0xBA,

        midgard_op_st_u8         = 0xC0, /* zero extends */
        midgard_op_st_i8         = 0xC1, /* sign extends */
        midgard_op_st_u16        = 0xC4, /* zero extends */
        midgard_op_st_i16        = 0xC5, /* sign extends */
        midgard_op_st_u16_be     = 0xC6, /* zero extends, big endian */
        midgard_op_st_i16_be     = 0xC7, /* sign extends, big endian */
        midgard_op_st_32         = 0xC8, /* short2, int, float */
        midgard_op_st_32_bswap2  = 0xC9, /* 16-bit big endian vector */
        midgard_op_st_32_bswap4  = 0xCA, /* 32-bit big endian scalar */
        midgard_op_st_64         = 0xCC, /* int2, float2, long */
        midgard_op_st_64_bswap2  = 0xCD, /* 16-bit big endian vector */
        midgard_op_st_64_bswap4  = 0xCE, /* 32-bit big endian vector */
        midgard_op_st_64_bswap8  = 0xCF, /* 64-bit big endian scalar */
        midgard_op_st_128        = 0xD0, /* float4, long2 */
        midgard_op_st_128_bswap2 = 0xD1, /* 16-bit big endian vector */
        midgard_op_st_128_bswap4 = 0xD2, /* 32-bit big endian vector */
        midgard_op_st_128_bswap8 = 0xD3, /* 64-bit big endian vector */

        midgard_op_st_vary_32 = 0xD4,
        midgard_op_st_vary_16 = 0xD5,
        midgard_op_st_vary_32u = 0xD6,
        midgard_op_st_vary_32i = 0xD7,

        /* Value to st in r27, location r26.w as short2 */
        midgard_op_st_image_32f = 0xD8,
        midgard_op_st_image_16f = 0xD9,
        midgard_op_st_image_32u = 0xDA,
        midgard_op_st_image_32i = 0xDB,

        midgard_op_st_special_32f = 0xDC,
        midgard_op_st_special_16f = 0xDD,
        midgard_op_st_special_32u = 0xDE,
        midgard_op_st_special_32i = 0xDF,

        /* Only works on v5 or newer.
         * Older cards must use ld_special with tilebuffer selectors. */
        midgard_op_st_tilebuffer_32f = 0xE8,
        midgard_op_st_tilebuffer_16f = 0xE9,
        midgard_op_st_tilebuffer_raw = 0xEA,
        midgard_op_trap = 0xFC,
} midgard_load_store_op;
664 
/* Interpolation qualifiers for varying loads. */
typedef enum {
        midgard_interp_sample = 0,
        midgard_interp_centroid = 1,
        midgard_interp_default = 2
} midgard_interpolation;
670 
/* Modifiers applied to the value produced by a varying load. */
typedef enum {
        midgard_varying_mod_none = 0,

        /* Take the would-be result and divide all components by its y/z/w
         * (perspective division baked in with the load)  */
        midgard_varying_mod_perspective_y = 1,
        midgard_varying_mod_perspective_z = 2,
        midgard_varying_mod_perspective_w = 3,

        /* The result is a 64-bit cubemap descriptor to use with
         * midgard_tex_op_normal or midgard_tex_op_gradient */
        midgard_varying_mod_cubemap = 4,
} midgard_varying_modifier;
684 
/* Interpolation controls for a varying load — presumably packed into the
 * load/store word's offset field; verify against the packer. */
typedef struct
__attribute__((__packed__))
{
        midgard_varying_modifier modifier : 3;

        bool flat_shading : 1;

        /* These are ignored if flat_shading is enabled. */
        bool perspective_correction : 1;
        bool centroid_mapping : 1;

        /* This is ignored if the shader only runs once per pixel. */
        bool interpolate_sample : 1;

        bool zero0 : 1; /* Always zero */

        /* Explicit sample position, as fixed-point coordinates within the
         * pixel — TODO confirm the fixed-point format */
        unsigned direct_sample_pos_x : 4;
        unsigned direct_sample_pos_y : 4;
}
midgard_varying_params;
705 
/* 8-bit register/etc selector for load/store ops */
typedef struct
__attribute__((__packed__))
{
        /* Indexes into the register */
        unsigned component : 2;

        /* Register select between r26/r27 */
        unsigned select : 1;

        unsigned unknown : 2;

        /* Like any good Arm instruction set, load/store arguments can be
         * implicitly left-shifted... but only the second argument. Zero for no
         * shifting, up to <<7 possible though. This is useful for indexing.
         *
         * For the first argument, it's unknown what these bits mean */
        unsigned shift : 3;
}
midgard_ldst_register_select;
726 
/* Type of the index register used for address computation in load/store ops. */
typedef enum {
        /* 0 is reserved */
        midgard_index_address_u64 = 1,
        midgard_index_address_u32 = 2,
        midgard_index_address_s32 = 3,
} midgard_index_address_format;
733 
/* A packed load/store instruction word (60 bits, fitting the word1/word2
 * fields of midgard_load_store). */
typedef struct
__attribute__((__packed__))
{
        midgard_load_store_op op : 8;

        /* Source/dest reg */
        unsigned reg  : 5;

        /* Generally is a writemask.
         * For ST_ATTR and ST_TEX, unused.
         * For other stores, each bit masks 1/4th of the output. */
        unsigned mask : 4;

        /* Swizzle for stores, but for atomics it encodes also the source
         * register. This fits because atomics don't need a swizzle since they
         * are not vectorized instructions. */
        unsigned swizzle : 8;

        /* Arg reg, meaning changes according to each opcode */
        unsigned arg_comp : 2;
        unsigned arg_reg  : 3;

        /* 64-bit address enable
         * 32-bit data type enable for CUBEMAP and perspective div.
         * Explicit indexing enable for LD_ATTR.
         * 64-bit coordinate enable for LD_IMAGE. */
        bool bitsize_toggle : 1;

        /* These are mainly used for opcodes that have addresses.
         * For cmpxchg, index_reg is used for the comparison value.
         * For ops that access the attrib table, bit 1 encodes which table.
         * For LD_VAR and LD/ST_ATTR, bit 0 enables dest/src type inferral. */
        midgard_index_address_format index_format : 2;
        unsigned index_comp  : 2;
        unsigned index_reg   : 3;
        unsigned index_shift : 4;

        /* Generally is a signed offset, but has different bitsize and starts at
         * different bits depending on the opcode, LDST_*_DISPLACEMENT helpers
         * are recommended when packing/unpacking this attribute.
         * For LD_UBO, bit 0 enables ubo index immediate.
         * For LD_TILEBUFFER_RAW, bit 0 disables sample index immediate. */
        int signed_offset : 18;
}
midgard_load_store_word;
779 
/* A complete 128-bit load/store bundle: the shared type/next_type tag bytes
 * followed by two packed 60-bit instruction words (midgard_load_store_word),
 * stored as raw integers here. */
typedef struct
__attribute__((__packed__))
{
        unsigned type      : 4;
        unsigned next_type : 4;
        uint64_t word1     : 60;
        uint64_t word2     : 60;
}
midgard_load_store;
789 
/* 8-bit register selector used in texture ops to select a bias/LOD/gradient
 * register, shoved into the `bias` field */

typedef struct
__attribute__((__packed__))
{
        /* 32-bit register, clear for half-register */
        unsigned full : 1;

        /* Register select between r28/r29 */
        unsigned select : 1;

        /* For a half-register, selects the upper half */
        unsigned upper : 1;

        /* Indexes into the register */
        unsigned component : 2;

        /* Padding to make this 8-bit */
        unsigned zero : 3;
}
midgard_tex_register_select;
812 
/* Texture pipeline results are in r28-r29, so r28 is the base register
 * (midgard_tex_register_select::select picks between the two). */
#define REG_TEX_BASE 28
815 
/* Opcodes for the texture pipeline (midgard_texture_word::op). */
enum mali_texture_op {
        /* [texture + LOD bias]
         * If the texture is mipmapped, barriers must be enabled in the
         * instruction word in order for this opcode to compute the output
         * correctly. */
        midgard_tex_op_normal = 1,

        /* [texture + gradient for LOD and anisotropy]
         * Unlike midgard_tex_op_normal, this opcode does not require barriers
         * to compute the output correctly. */
        midgard_tex_op_gradient = 2,

        /* [unfiltered texturing]
         * Unlike midgard_tex_op_normal, this opcode does not require barriers
         * to compute the output correctly. */
        midgard_tex_op_fetch = 4,

        /* [gradient from derivative] */
        midgard_tex_op_grad_from_derivative = 9,

        /* [mov] */
        midgard_tex_op_mov = 10,

        /* [noop]
         * Mostly used for barriers. */
        midgard_tex_op_barrier = 11,

        /* [gradient from coords] */
        midgard_tex_op_grad_from_coords = 12,

        /* [derivative]
         * Computes derivatives in 2x2 fragment blocks. */
        midgard_tex_op_derivative = 13
};
850 
/* Sampler data type, mapping to the GLSL sampler/usampler/isampler
 * families (midgard_texture_word::sampler_type). */
enum mali_sampler_type {
        /* 0 is reserved */
        MALI_SAMPLER_FLOAT      = 0x1, /* sampler */
        MALI_SAMPLER_UNSIGNED   = 0x2, /* usampler */
        MALI_SAMPLER_SIGNED     = 0x3, /* isampler */
};
857 
/* Texture modes (midgard_texture_word::mode): normal sampling, shadow
 * comparison, and the per-component gather variants. */
enum mali_texture_mode {
        TEXTURE_NORMAL = 1,
        TEXTURE_SHADOW = 5,
        TEXTURE_GATHER_SHADOW = 6,
        TEXTURE_GATHER_X = 8,
        TEXTURE_GATHER_Y = 9,
        TEXTURE_GATHER_Z = 10,
        TEXTURE_GATHER_W = 11,
};
868 
/* Axis selector for midgard_tex_op_derivative (dFdx vs. dFdy). */
enum mali_derivative_mode {
        TEXTURE_DFDX = 0,
        TEXTURE_DFDY = 1,
};
873 
/* Controls behaviour for helper invocations / partially-killed quads
 * (midgard_texture_word::exec). Value 0 is not listed here; presumably
 * reserved — confirm against the disassembler. */
enum midgard_partial_execution {
        MIDGARD_PARTIAL_EXECUTION_SKIP = 1,
        MIDGARD_PARTIAL_EXECUTION_KILL = 2,
        MIDGARD_PARTIAL_EXECUTION_NONE = 3
};
879 
/* A complete 128-bit texture instruction word. Many operands (LOD/bias,
 * offsets, sampler/texture indices) can be given either as immediates or
 * via register selectors, chosen by the *_register flags below. */
typedef struct
__attribute__((__packed__))
{
        unsigned type      : 4;
        unsigned next_type : 4;

        enum mali_texture_op op  : 4;
        unsigned mode : 4;
        enum midgard_partial_execution exec : 2;

        unsigned format : 2;

        /* Are sampler_handle/texture_handle respectively set by registers? If
         * true, the lower 8-bits of the respective field is a register word.
         * If false, they are an immediate */

        unsigned sampler_register : 1;
        unsigned texture_register : 1;

        /* Is a register used to specify the
         * LOD/bias/offset? If set, use the `bias` field as
         * a register index. If clear, use the `bias` field
         * as an immediate. */
        unsigned lod_register : 1;

        /* Is a register used to specify an offset? If set, use the
         * offset_reg_* fields to encode this, duplicated for each of the
         * components. If clear, there is implicitly always an immediate offset
         * specified in offset_imm_* */
        unsigned offset_register : 1;

        /* Input (coordinate) register: full/half width, r-pair select,
         * upper-half select, and component swizzle. */
        unsigned in_reg_full  : 1;
        unsigned in_reg_select : 1;
        unsigned in_reg_upper  : 1;
        unsigned in_reg_swizzle : 8;

        unsigned unknown8  : 2;

        unsigned out_full  : 1;

        enum mali_sampler_type sampler_type : 2;

        /* Output register selection, mirroring the in_reg_* fields. */
        unsigned out_reg_select : 1;
        unsigned out_upper : 1;

        unsigned mask : 4;

        /* Intriguingly, textures can take an outmod just like alu ops. Int
         * outmods are not supported as far as I can tell, so this is only
         * meaningful for float samplers */
        midgard_outmod_float outmod  : 2;

        unsigned swizzle  : 8;

        /* These indicate how many bundles after this texture op may be
         * executed in parallel with this op. We may execute only ALU and
         * ld/st in parallel (not other textures), and obviously there cannot
         * be any dependency (the blob appears to forbid even accessing other
         * channels of a given texture register). */

        unsigned out_of_order   : 2;
        unsigned unknown4  : 10;

        /* In immediate mode, each offset field is an immediate range [0, 7].
         *
         * In register mode, offset_x becomes a register (full, select, upper)
         * triplet followed by a vec3 swizzle is splattered across
         * offset_y/offset_z in a genuinely bizarre way.
         *
         * For texel fetches in immediate mode, the range is the full [-8, 7],
         * but for normal texturing the top bit must be zero and a register
         * used instead. It's not clear where this limitation is from.
         *
         * union {
         *      struct {
         *              signed offset_x  : 4;
         *              signed offset_y  : 4;
         *              signed offset_z  : 4;
         *      } immediate;
         *      struct {
         *              bool full        : 1;
         *              bool select      : 1;
         *              bool upper       : 1;
         *              unsigned swizzle : 8;
         *              unsigned zero    : 1;
         *      } register;
         * }
         */

        unsigned offset : 12;

        /* In immediate bias mode, for a normal texture op, this is
         * texture bias, computed as int(2^8 * frac(biasf)), with
         * bias_int = floor(bias). For a textureLod, it's that, but
         * s/bias/lod. For a texel fetch, this is the LOD as-is.
         *
         * In register mode, this is a midgard_tex_register_select
         * structure and bias_int is zero */

        unsigned bias : 8;
        signed bias_int  : 8;

        /* If sampler/texture_register is set, the bottom 8-bits are
         * midgard_tex_register_select and the top 8-bits are zero. If they are
         * clear, they are immediate texture indices */

        unsigned sampler_handle : 16;
        unsigned texture_handle : 16;
}
midgard_texture_word;
990 
/* Technically barriers are texture instructions but it's less work to add them
 * as an explicitly zeroed special case, since most fields are forced to go to
 * zero */

typedef struct
__attribute__((__packed__))
{
        unsigned type      : 4;
        unsigned next_type : 4;

        /* op = TEXTURE_OP_BARRIER */
        unsigned op  : 6;
        unsigned zero1    : 2;

        /* Since helper invocations don't make any sense, these are forced to one */
        unsigned cont  : 1;
        unsigned last  : 1;
        unsigned zero2 : 14;

        unsigned zero3 : 24;
        /* Parallelism hint, as in midgard_texture_word::out_of_order. */
        unsigned out_of_order : 4;
        unsigned zero4 : 4;

        /* Remaining 64 bits of the 128-bit word, always zero. */
        uint64_t zero5;
} midgard_texture_barrier_word;
1016 
/* The 128-bit inline constant slot of a bundle, viewable at every supported
 * lane width. Note f16 holds raw 16-bit patterns in a uint16_t (C has no
 * native half-float type); f32/f64 use real float types. */
typedef union midgard_constants {
        double f64[2];
        uint64_t u64[2];
        int64_t i64[2];
        float f32[4];
        uint32_t u32[4];
        int32_t i32[4];
        uint16_t f16[8];
        uint16_t u16[8];
        int16_t i16[8];
        uint8_t u8[16];
        int8_t i8[16];
}
midgard_constants;
1031 
/* IEEE-754 style rounding modes for float ops. */
enum midgard_roundmode {
        MIDGARD_RTE = 0x0, /* round to even */
        MIDGARD_RTZ = 0x1, /* round to zero */
        MIDGARD_RTN = 0x2, /* round to negative */
        MIDGARD_RTP = 0x3, /* round to positive */
};
1038 
1039 #endif
1040