xref: /linux/drivers/net/ethernet/netronome/nfp/bpf/jit.c (revision 6c8c1406)
1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 /* Copyright (C) 2016-2018 Netronome Systems, Inc. */
3 
4 #define pr_fmt(fmt)	"NFP net bpf: " fmt
5 
6 #include <linux/bug.h>
7 #include <linux/bpf.h>
8 #include <linux/filter.h>
9 #include <linux/kernel.h>
10 #include <linux/pkt_cls.h>
11 #include <linux/reciprocal_div.h>
12 #include <linux/unistd.h>
13 
14 #include "main.h"
15 #include "../nfp_asm.h"
16 #include "../nfp_net_ctrl.h"
17 
18 /* --- NFP prog --- */
/* Foreach "multiple" entries macros provide pos and next<n> pointers.
 * It's safe to modify the next pointers (but not pos).
 *
 * Each macro expands to a for-loop header over the (nfp_prog)->insns list.
 * The body sees a sliding window made of @pos and the entries following it;
 * the loop terminates as soon as any pointer of the window would refer to
 * the list head, so only complete windows are visited.  On every step @pos
 * advances by one entry and the next pointers are recomputed from it.
 *
 * nfp_for_each_insn_walk2() - walk with a window of two: @pos and @next.
 */
#define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos))
29 
/* nfp_for_each_insn_walk3() - walk the (nfp_prog)->insns list with a window
 * of three consecutive entries: @pos, @next and @next2.
 *
 * Expands to a for-loop header.  The loop ends once any of the three
 * pointers would refer to the list head.  On every step @pos advances by one
 * entry, @next is recomputed from @pos and @next2 from @next.
 */
#define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l),			\
	     next2 = list_next_entry(next, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l &&			\
	     &(nfp_prog)->insns != &next2->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos),				\
	     next2 = nfp_meta_next(next))
40 
41 static bool
42 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
43 {
44 	return meta->l.prev != &nfp_prog->insns;
45 }
46 
47 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
48 {
49 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
50 		pr_warn("instruction limit reached (%u NFP instructions)\n",
51 			nfp_prog->prog_len);
52 		nfp_prog->error = -ENOSPC;
53 		return;
54 	}
55 
56 	nfp_prog->prog[nfp_prog->prog_len] = insn;
57 	nfp_prog->prog_len++;
58 }
59 
60 static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
61 {
62 	return nfp_prog->prog_len;
63 }
64 
65 static bool
66 nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
67 {
68 	/* If there is a recorded error we may have dropped instructions;
69 	 * that doesn't have to be due to translator bug, and the translation
70 	 * will fail anyway, so just return OK.
71 	 */
72 	if (nfp_prog->error)
73 		return true;
74 	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
75 }
76 
77 /* --- Emitters --- */
78 static void
79 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
80 	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
81 	   bool indir)
82 {
83 	u64 insn;
84 
85 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
86 		FIELD_PREP(OP_CMD_CTX, ctx) |
87 		FIELD_PREP(OP_CMD_B_SRC, breg) |
88 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
89 		FIELD_PREP(OP_CMD_XFER, xfer) |
90 		FIELD_PREP(OP_CMD_CNT, size) |
91 		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
92 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
93 		FIELD_PREP(OP_CMD_INDIR, indir) |
94 		FIELD_PREP(OP_CMD_MODE, mode);
95 
96 	nfp_prog_push(nfp_prog, insn);
97 }
98 
99 static void
100 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
101 	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
102 {
103 	struct nfp_insn_re_regs reg;
104 	int err;
105 
106 	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
107 	if (err) {
108 		nfp_prog->error = err;
109 		return;
110 	}
111 	if (reg.swap) {
112 		pr_err("cmd can't swap arguments\n");
113 		nfp_prog->error = -EFAULT;
114 		return;
115 	}
116 	if (reg.dst_lmextn || reg.src_lmextn) {
117 		pr_err("cmd can't use LMextn\n");
118 		nfp_prog->error = -EFAULT;
119 		return;
120 	}
121 
122 	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
123 		   indir);
124 }
125 
126 static void
127 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
128 	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
129 {
130 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
131 }
132 
133 static void
134 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
135 	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
136 {
137 	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
138 }
139 
140 static void
141 __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
142 	  enum br_ctx_signal_state css, u16 addr, u8 defer)
143 {
144 	u16 addr_lo, addr_hi;
145 	u64 insn;
146 
147 	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
148 	addr_hi = addr != addr_lo;
149 
150 	insn = OP_BR_BASE |
151 		FIELD_PREP(OP_BR_MASK, mask) |
152 		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
153 		FIELD_PREP(OP_BR_CSS, css) |
154 		FIELD_PREP(OP_BR_DEFBR, defer) |
155 		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
156 		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);
157 
158 	nfp_prog_push(nfp_prog, insn);
159 }
160 
161 static void
162 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
163 	     enum nfp_relo_type relo)
164 {
165 	if (mask == BR_UNC && defer > 2) {
166 		pr_err("BUG: branch defer out of bounds %d\n", defer);
167 		nfp_prog->error = -EFAULT;
168 		return;
169 	}
170 
171 	__emit_br(nfp_prog, mask,
172 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
173 		  BR_CSS_NONE, addr, defer);
174 
175 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
176 		FIELD_PREP(OP_RELO_TYPE, relo);
177 }
178 
179 static void
180 emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
181 {
182 	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
183 }
184 
185 static void
186 __emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
187 	      bool set, bool src_lmextn)
188 {
189 	u16 addr_lo, addr_hi;
190 	u64 insn;
191 
192 	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
193 	addr_hi = addr != addr_lo;
194 
195 	insn = OP_BR_BIT_BASE |
196 		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
197 		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
198 		FIELD_PREP(OP_BR_BIT_BV, set) |
199 		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
200 		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
201 		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
202 		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);
203 
204 	nfp_prog_push(nfp_prog, insn);
205 }
206 
207 static void
208 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
209 		 u8 defer, bool set, enum nfp_relo_type relo)
210 {
211 	struct nfp_insn_re_regs reg;
212 	int err;
213 
214 	/* NOTE: The bit to test is specified as an rotation amount, such that
215 	 *	 the bit to test will be placed on the MSB of the result when
216 	 *	 doing a rotate right. For bit X, we need right rotate X + 1.
217 	 */
218 	bit += 1;
219 
220 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
221 	if (err) {
222 		nfp_prog->error = err;
223 		return;
224 	}
225 
226 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
227 		      reg.src_lmextn);
228 
229 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
230 		FIELD_PREP(OP_RELO_TYPE, relo);
231 }
232 
233 static void
234 emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
235 {
236 	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
237 }
238 
239 static void
240 __emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
241 	      u8 defer, bool dst_lmextn, bool src_lmextn)
242 {
243 	u64 insn;
244 
245 	insn = OP_BR_ALU_BASE |
246 		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
247 		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
248 		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
249 		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
250 		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
251 		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);
252 
253 	nfp_prog_push(nfp_prog, insn);
254 }
255 
256 static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
257 {
258 	struct nfp_insn_ur_regs reg;
259 	int err;
260 
261 	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
262 	if (err) {
263 		nfp_prog->error = err;
264 		return;
265 	}
266 
267 	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
268 		      reg.src_lmextn);
269 }
270 
271 static void
272 __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
273 	     enum immed_width width, bool invert,
274 	     enum immed_shift shift, bool wr_both,
275 	     bool dst_lmextn, bool src_lmextn)
276 {
277 	u64 insn;
278 
279 	insn = OP_IMMED_BASE |
280 		FIELD_PREP(OP_IMMED_A_SRC, areg) |
281 		FIELD_PREP(OP_IMMED_B_SRC, breg) |
282 		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
283 		FIELD_PREP(OP_IMMED_WIDTH, width) |
284 		FIELD_PREP(OP_IMMED_INV, invert) |
285 		FIELD_PREP(OP_IMMED_SHIFT, shift) |
286 		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
287 		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
288 		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);
289 
290 	nfp_prog_push(nfp_prog, insn);
291 }
292 
293 static void
294 emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
295 	   enum immed_width width, bool invert, enum immed_shift shift)
296 {
297 	struct nfp_insn_ur_regs reg;
298 	int err;
299 
300 	if (swreg_type(dst) == NN_REG_IMM) {
301 		nfp_prog->error = -EFAULT;
302 		return;
303 	}
304 
305 	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
306 	if (err) {
307 		nfp_prog->error = err;
308 		return;
309 	}
310 
311 	/* Use reg.dst when destination is No-Dest. */
312 	__emit_immed(nfp_prog,
313 		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
314 		     reg.breg, imm >> 8, width, invert, shift,
315 		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
316 }
317 
318 static void
319 __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
320 	   enum shf_sc sc, u8 shift,
321 	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
322 	   bool dst_lmextn, bool src_lmextn)
323 {
324 	u64 insn;
325 
326 	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
327 		nfp_prog->error = -EFAULT;
328 		return;
329 	}
330 
331 	/* NFP shift instruction has something special. If shift direction is
332 	 * left then shift amount of 1 to 31 is specified as 32 minus the amount
333 	 * to shift.
334 	 *
335 	 * But no need to do this for indirect shift which has shift amount be
336 	 * 0. Even after we do this subtraction, shift amount 0 will be turned
337 	 * into 32 which will eventually be encoded the same as 0 because only
338 	 * low 5 bits are encoded, but shift amount be 32 will fail the
339 	 * FIELD_PREP check done later on shift mask (0x1f), due to 32 is out of
340 	 * mask range.
341 	 */
342 	if (sc == SHF_SC_L_SHF && shift)
343 		shift = 32 - shift;
344 
345 	insn = OP_SHF_BASE |
346 		FIELD_PREP(OP_SHF_A_SRC, areg) |
347 		FIELD_PREP(OP_SHF_SC, sc) |
348 		FIELD_PREP(OP_SHF_B_SRC, breg) |
349 		FIELD_PREP(OP_SHF_I8, i8) |
350 		FIELD_PREP(OP_SHF_SW, sw) |
351 		FIELD_PREP(OP_SHF_DST, dst) |
352 		FIELD_PREP(OP_SHF_SHIFT, shift) |
353 		FIELD_PREP(OP_SHF_OP, op) |
354 		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
355 		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
356 		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
357 		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);
358 
359 	nfp_prog_push(nfp_prog, insn);
360 }
361 
362 static void
363 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
364 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
365 {
366 	struct nfp_insn_re_regs reg;
367 	int err;
368 
369 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
370 	if (err) {
371 		nfp_prog->error = err;
372 		return;
373 	}
374 
375 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
376 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
377 		   reg.dst_lmextn, reg.src_lmextn);
378 }
379 
380 static void
381 emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
382 	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
383 {
384 	if (sc == SHF_SC_R_ROT) {
385 		pr_err("indirect shift is not allowed on rotation\n");
386 		nfp_prog->error = -EFAULT;
387 		return;
388 	}
389 
390 	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
391 }
392 
393 static void
394 __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
395 	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
396 	   bool dst_lmextn, bool src_lmextn)
397 {
398 	u64 insn;
399 
400 	insn = OP_ALU_BASE |
401 		FIELD_PREP(OP_ALU_A_SRC, areg) |
402 		FIELD_PREP(OP_ALU_B_SRC, breg) |
403 		FIELD_PREP(OP_ALU_DST, dst) |
404 		FIELD_PREP(OP_ALU_SW, swap) |
405 		FIELD_PREP(OP_ALU_OP, op) |
406 		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
407 		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
408 		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
409 		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);
410 
411 	nfp_prog_push(nfp_prog, insn);
412 }
413 
414 static void
415 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
416 	 swreg lreg, enum alu_op op, swreg rreg)
417 {
418 	struct nfp_insn_ur_regs reg;
419 	int err;
420 
421 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
422 	if (err) {
423 		nfp_prog->error = err;
424 		return;
425 	}
426 
427 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
428 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
429 		   reg.dst_lmextn, reg.src_lmextn);
430 }
431 
432 static void
433 __emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
434 	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
435 	   bool wr_both, bool dst_lmextn, bool src_lmextn)
436 {
437 	u64 insn;
438 
439 	insn = OP_MUL_BASE |
440 		FIELD_PREP(OP_MUL_A_SRC, areg) |
441 		FIELD_PREP(OP_MUL_B_SRC, breg) |
442 		FIELD_PREP(OP_MUL_STEP, step) |
443 		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
444 		FIELD_PREP(OP_MUL_SW, swap) |
445 		FIELD_PREP(OP_MUL_TYPE, type) |
446 		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
447 		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
448 		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);
449 
450 	nfp_prog_push(nfp_prog, insn);
451 }
452 
453 static void
454 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
455 	 enum mul_step step, swreg rreg)
456 {
457 	struct nfp_insn_ur_regs reg;
458 	u16 areg;
459 	int err;
460 
461 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
462 		nfp_prog->error = -EINVAL;
463 		return;
464 	}
465 
466 	if (step == MUL_LAST || step == MUL_LAST_2) {
467 		/* When type is step and step Number is LAST or LAST2, left
468 		 * source is used as destination.
469 		 */
470 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
471 		areg = reg.dst;
472 	} else {
473 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
474 		areg = reg.areg;
475 	}
476 
477 	if (err) {
478 		nfp_prog->error = err;
479 		return;
480 	}
481 
482 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
483 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
484 }
485 
486 static void
487 __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
488 		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
489 		bool zero, bool swap, bool wr_both,
490 		bool dst_lmextn, bool src_lmextn)
491 {
492 	u64 insn;
493 
494 	insn = OP_LDF_BASE |
495 		FIELD_PREP(OP_LDF_A_SRC, areg) |
496 		FIELD_PREP(OP_LDF_SC, sc) |
497 		FIELD_PREP(OP_LDF_B_SRC, breg) |
498 		FIELD_PREP(OP_LDF_I8, imm8) |
499 		FIELD_PREP(OP_LDF_SW, swap) |
500 		FIELD_PREP(OP_LDF_ZF, zero) |
501 		FIELD_PREP(OP_LDF_BMASK, bmask) |
502 		FIELD_PREP(OP_LDF_SHF, shift) |
503 		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
504 		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
505 		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);
506 
507 	nfp_prog_push(nfp_prog, insn);
508 }
509 
510 static void
511 emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
512 		  enum shf_sc sc, u8 shift, bool zero)
513 {
514 	struct nfp_insn_re_regs reg;
515 	int err;
516 
517 	/* Note: ld_field is special as it uses one of the src regs as dst */
518 	err = swreg_to_restricted(dst, dst, src, &reg, true);
519 	if (err) {
520 		nfp_prog->error = err;
521 		return;
522 	}
523 
524 	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
525 			reg.i8, zero, reg.swap, reg.wr_both,
526 			reg.dst_lmextn, reg.src_lmextn);
527 }
528 
529 static void
530 emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
531 	      enum shf_sc sc, u8 shift)
532 {
533 	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
534 }
535 
536 static void
537 __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
538 	    bool dst_lmextn, bool src_lmextn)
539 {
540 	u64 insn;
541 
542 	insn = OP_LCSR_BASE |
543 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
544 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
545 		FIELD_PREP(OP_LCSR_WRITE, wr) |
546 		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
547 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
548 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
549 
550 	nfp_prog_push(nfp_prog, insn);
551 }
552 
553 static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
554 {
555 	struct nfp_insn_ur_regs reg;
556 	int err;
557 
558 	/* This instruction takes immeds instead of reg_none() for the ignored
559 	 * operand, but we can't encode 2 immeds in one instr with our normal
560 	 * swreg infra so if param is an immed, we encode as reg_none() and
561 	 * copy the immed to both operands.
562 	 */
563 	if (swreg_type(src) == NN_REG_IMM) {
564 		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
565 		reg.breg = reg.areg;
566 	} else {
567 		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
568 	}
569 	if (err) {
570 		nfp_prog->error = err;
571 		return;
572 	}
573 
574 	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
575 		    false, reg.src_lmextn);
576 }
577 
578 /* CSR value is read in following immed[gpr, 0] */
579 static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
580 {
581 	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
582 }
583 
584 static void emit_nop(struct nfp_prog *nfp_prog)
585 {
586 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
587 }
588 
589 /* --- Wrappers --- */
590 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
591 {
592 	if (!(imm & 0xffff0000)) {
593 		*val = imm;
594 		*shift = IMMED_SHIFT_0B;
595 	} else if (!(imm & 0xff0000ff)) {
596 		*val = imm >> 8;
597 		*shift = IMMED_SHIFT_1B;
598 	} else if (!(imm & 0x0000ffff)) {
599 		*val = imm >> 16;
600 		*shift = IMMED_SHIFT_2B;
601 	} else {
602 		return false;
603 	}
604 
605 	return true;
606 }
607 
608 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
609 {
610 	enum immed_shift shift;
611 	u16 val;
612 
613 	if (pack_immed(imm, &val, &shift)) {
614 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
615 	} else if (pack_immed(~imm, &val, &shift)) {
616 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
617 	} else {
618 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
619 			   false, IMMED_SHIFT_0B);
620 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
621 			   false, IMMED_SHIFT_2B);
622 	}
623 }
624 
625 static void
626 wrp_zext(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst)
627 {
628 	if (meta->flags & FLAG_INSN_DO_ZEXT)
629 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
630 }
631 
632 static void
633 wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
634 	       enum nfp_relo_type relo)
635 {
636 	if (imm > 0xffff) {
637 		pr_err("relocation of a large immediate!\n");
638 		nfp_prog->error = -EFAULT;
639 		return;
640 	}
641 	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
642 
643 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
644 		FIELD_PREP(OP_RELO_TYPE, relo);
645 }
646 
647 /* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
648  * If the @imm is small enough encode it directly in operand and return
649  * otherwise load @imm to a spare register and return its encoding.
650  */
651 static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
652 {
653 	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
654 		return reg_imm(imm);
655 
656 	wrp_immed(nfp_prog, tmp_reg, imm);
657 	return tmp_reg;
658 }
659 
660 /* re_load_imm_any() - encode immediate or use tmp register (restricted)
661  * If the @imm is small enough encode it directly in operand and return
662  * otherwise load @imm to a spare register and return its encoding.
663  */
664 static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
665 {
666 	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
667 		return reg_imm(imm);
668 
669 	wrp_immed(nfp_prog, tmp_reg, imm);
670 	return tmp_reg;
671 }
672 
/* Emit @count consecutive no-op instructions. */
static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
{
	unsigned int emitted;

	for (emitted = 0; emitted < count; emitted++)
		emit_nop(nfp_prog);
}
678 
679 static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
680 {
681 	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
682 }
683 
684 static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
685 {
686 	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
687 }
688 
689 /* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
690  * result to @dst from low end.
691  */
692 static void
693 wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
694 		u8 offset)
695 {
696 	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
697 	u8 mask = (1 << field_len) - 1;
698 
699 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
700 }
701 
702 /* wrp_reg_or_subpart() - load @field_len bytes from low end of @src, or the
703  * result to @dst from offset, there is no change on the other bits of @dst.
704  */
705 static void
706 wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
707 		   u8 field_len, u8 offset)
708 {
709 	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
710 	u8 mask = ((1 << field_len) - 1) << offset;
711 
712 	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
713 }
714 
715 static void
716 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
717 	      swreg *rega, swreg *regb)
718 {
719 	if (offset == reg_imm(0)) {
720 		*rega = reg_a(src_gpr);
721 		*regb = reg_b(src_gpr + 1);
722 		return;
723 	}
724 
725 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
726 	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
727 		 reg_imm(0));
728 	*rega = imm_a(nfp_prog);
729 	*regb = imm_b(nfp_prog);
730 }
731 
732 /* NFP has Command Push Pull bus which supports bluk memory operations. */
733 static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
734 {
735 	bool descending_seq = meta->ldst_gather_len < 0;
736 	s16 len = abs(meta->ldst_gather_len);
737 	swreg src_base, off;
738 	bool src_40bit_addr;
739 	unsigned int i;
740 	u8 xfer_num;
741 
742 	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
743 	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
744 	src_base = reg_a(meta->insn.src_reg * 2);
745 	xfer_num = round_up(len, 4) / 4;
746 
747 	if (src_40bit_addr)
748 		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
749 			      &off);
750 
751 	/* Setup PREV_ALU fields to override memory read length. */
752 	if (len > 32)
753 		wrp_immed(nfp_prog, reg_none(),
754 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
755 
756 	/* Memory read from source addr into transfer-in registers. */
757 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
758 		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
759 		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
760 
761 	/* Move from transfer-in to transfer-out. */
762 	for (i = 0; i < xfer_num; i++)
763 		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
764 
765 	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
766 
767 	if (len <= 8) {
768 		/* Use single direct_ref write8. */
769 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
770 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
771 			 CMD_CTX_SWAP);
772 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
773 		/* Use single direct_ref write32. */
774 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
775 			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
776 			 CMD_CTX_SWAP);
777 	} else if (len <= 32) {
778 		/* Use single indirect_ref write8. */
779 		wrp_immed(nfp_prog, reg_none(),
780 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
781 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
782 			       reg_a(meta->paired_st->dst_reg * 2), off,
783 			       len - 1, CMD_CTX_SWAP);
784 	} else if (IS_ALIGNED(len, 4)) {
785 		/* Use single indirect_ref write32. */
786 		wrp_immed(nfp_prog, reg_none(),
787 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
788 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
789 			       reg_a(meta->paired_st->dst_reg * 2), off,
790 			       xfer_num - 1, CMD_CTX_SWAP);
791 	} else if (len <= 40) {
792 		/* Use one direct_ref write32 to write the first 32-bytes, then
793 		 * another direct_ref write8 to write the remaining bytes.
794 		 */
795 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
796 			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
797 			 CMD_CTX_SWAP);
798 
799 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
800 				      imm_b(nfp_prog));
801 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
802 			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
803 			 CMD_CTX_SWAP);
804 	} else {
805 		/* Use one indirect_ref write32 to write 4-bytes aligned length,
806 		 * then another direct_ref write8 to write the remaining bytes.
807 		 */
808 		u8 new_off;
809 
810 		wrp_immed(nfp_prog, reg_none(),
811 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
812 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
813 			       reg_a(meta->paired_st->dst_reg * 2), off,
814 			       xfer_num - 2, CMD_CTX_SWAP);
815 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
816 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
817 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
818 			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
819 			 (len & 0x3) - 1, CMD_CTX_SWAP);
820 	}
821 
822 	/* TODO: The following extra load is to make sure data flow be identical
823 	 *  before and after we do memory copy optimization.
824 	 *
825 	 *  The load destination register is not guaranteed to be dead, so we
826 	 *  need to make sure it is loaded with the value the same as before
827 	 *  this transformation.
828 	 *
829 	 *  These extra loads could be removed once we have accurate register
830 	 *  usage information.
831 	 */
832 	if (descending_seq)
833 		xfer_num = 0;
834 	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
835 		xfer_num = xfer_num - 1;
836 	else
837 		xfer_num = xfer_num - 2;
838 
839 	switch (BPF_SIZE(meta->insn.code)) {
840 	case BPF_B:
841 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
842 				reg_xfer(xfer_num), 1,
843 				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
844 		break;
845 	case BPF_H:
846 		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
847 				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
848 		break;
849 	case BPF_W:
850 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
851 			reg_xfer(0));
852 		break;
853 	case BPF_DW:
854 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
855 			reg_xfer(xfer_num));
856 		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
857 			reg_xfer(xfer_num + 1));
858 		break;
859 	}
860 
861 	if (BPF_SIZE(meta->insn.code) != BPF_DW)
862 		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
863 
864 	return 0;
865 }
866 
867 static int
868 data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, swreg offset,
869 	u8 dst_gpr, int size)
870 {
871 	unsigned int i;
872 	u16 shift, sz;
873 
874 	/* We load the value from the address indicated in @offset and then
875 	 * shift out the data we don't need.  Note: this is big endian!
876 	 */
877 	sz = max(size, 4);
878 	shift = size < 4 ? 4 - size : 0;
879 
880 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
881 		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
882 
883 	i = 0;
884 	if (shift)
885 		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
886 			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
887 	else
888 		for (; i * 4 < size; i++)
889 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
890 
891 	if (i < 2)
892 		wrp_zext(nfp_prog, meta, dst_gpr);
893 
894 	return 0;
895 }
896 
897 static int
898 data_ld_host_order(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
899 		   u8 dst_gpr, swreg lreg, swreg rreg, int size,
900 		   enum cmd_mode mode)
901 {
902 	unsigned int i;
903 	u8 mask, sz;
904 
905 	/* We load the value from the address indicated in rreg + lreg and then
906 	 * mask out the data we don't need.  Note: this is little endian!
907 	 */
908 	sz = max(size, 4);
909 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
910 
911 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
912 		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
913 
914 	i = 0;
915 	if (mask)
916 		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
917 				  reg_xfer(0), SHF_SC_NONE, 0, true);
918 	else
919 		for (; i * 4 < size; i++)
920 			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));
921 
922 	if (i < 2)
923 		wrp_zext(nfp_prog, meta, dst_gpr);
924 
925 	return 0;
926 }
927 
928 static int
929 data_ld_host_order_addr32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
930 			  u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
931 {
932 	return data_ld_host_order(nfp_prog, meta, dst_gpr, reg_a(src_gpr),
933 				  offset, size, CMD_MODE_32b);
934 }
935 
936 static int
937 data_ld_host_order_addr40(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
938 			  u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
939 {
940 	swreg rega, regb;
941 
942 	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);
943 
944 	return data_ld_host_order(nfp_prog, meta, dst_gpr, rega, regb,
945 				  size, CMD_MODE_40b_BA);
946 }
947 
948 static int
949 construct_data_ind_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
950 		      u16 offset, u16 src, u8 size)
951 {
952 	swreg tmp_reg;
953 
954 	/* Calculate the true offset (src_reg + imm) */
955 	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
956 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);
957 
958 	/* Check packet length (size guaranteed to fit b/c it's u8) */
959 	emit_alu(nfp_prog, imm_a(nfp_prog),
960 		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
961 	emit_alu(nfp_prog, reg_none(),
962 		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
963 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
964 
965 	/* Load data */
966 	return data_ld(nfp_prog, meta, imm_b(nfp_prog), 0, size);
967 }
968 
969 static int
970 construct_data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
971 		  u16 offset, u8 size)
972 {
973 	swreg tmp_reg;
974 
975 	/* Check packet length */
976 	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
977 	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
978 	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);
979 
980 	/* Load data */
981 	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
982 	return data_ld(nfp_prog, meta, tmp_reg, 0, size);
983 }
984 
985 static int
986 data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
987 		    u8 src_gpr, u8 size)
988 {
989 	unsigned int i;
990 
991 	for (i = 0; i * 4 < size; i++)
992 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
993 
994 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
995 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
996 
997 	return 0;
998 }
999 
1000 static int
1001 data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
1002 		   u64 imm, u8 size)
1003 {
1004 	wrp_immed(nfp_prog, reg_xfer(0), imm);
1005 	if (size == 8)
1006 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
1007 
1008 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
1009 		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
1010 
1011 	return 0;
1012 }
1013 
1014 typedef int
1015 (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
1016 	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1017 	     bool needs_inc);
1018 
1019 static int
1020 wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
1021 	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1022 	      bool needs_inc)
1023 {
1024 	bool should_inc = needs_inc && new_gpr && !last;
1025 	u32 idx, src_byte;
1026 	enum shf_sc sc;
1027 	swreg reg;
1028 	int shf;
1029 	u8 mask;
1030 
1031 	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
1032 		return -EOPNOTSUPP;
1033 
1034 	idx = off / 4;
1035 
1036 	/* Move the entire word */
1037 	if (size == 4) {
1038 		wrp_mov(nfp_prog, reg_both(dst),
1039 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
1040 		return 0;
1041 	}
1042 
1043 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1044 		return -EOPNOTSUPP;
1045 
1046 	src_byte = off % 4;
1047 
1048 	mask = (1 << size) - 1;
1049 	mask <<= dst_byte;
1050 
1051 	if (WARN_ON_ONCE(mask > 0xf))
1052 		return -EOPNOTSUPP;
1053 
1054 	shf = abs(src_byte - dst_byte) * 8;
1055 	if (src_byte == dst_byte) {
1056 		sc = SHF_SC_NONE;
1057 	} else if (src_byte < dst_byte) {
1058 		shf = 32 - shf;
1059 		sc = SHF_SC_L_SHF;
1060 	} else {
1061 		sc = SHF_SC_R_SHF;
1062 	}
1063 
1064 	/* ld_field can address fewer indexes, if offset too large do RMW.
1065 	 * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
1066 	 */
1067 	if (idx <= RE_REG_LM_IDX_MAX) {
1068 		reg = reg_lm(lm3 ? 3 : 0, idx);
1069 	} else {
1070 		reg = imm_a(nfp_prog);
1071 		/* If it's not the first part of the load and we start a new GPR
1072 		 * that means we are loading a second part of the LMEM word into
1073 		 * a new GPR.  IOW we've already looked that LMEM word and
1074 		 * therefore it has been loaded into imm_a().
1075 		 */
1076 		if (first || !new_gpr)
1077 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1078 	}
1079 
1080 	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
1081 
1082 	if (should_inc)
1083 		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1084 
1085 	return 0;
1086 }
1087 
1088 static int
1089 wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
1090 	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
1091 	       bool needs_inc)
1092 {
1093 	bool should_inc = needs_inc && new_gpr && !last;
1094 	u32 idx, dst_byte;
1095 	enum shf_sc sc;
1096 	swreg reg;
1097 	int shf;
1098 	u8 mask;
1099 
1100 	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
1101 		return -EOPNOTSUPP;
1102 
1103 	idx = off / 4;
1104 
1105 	/* Move the entire word */
1106 	if (size == 4) {
1107 		wrp_mov(nfp_prog,
1108 			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
1109 			reg_b(src));
1110 		return 0;
1111 	}
1112 
1113 	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
1114 		return -EOPNOTSUPP;
1115 
1116 	dst_byte = off % 4;
1117 
1118 	mask = (1 << size) - 1;
1119 	mask <<= dst_byte;
1120 
1121 	if (WARN_ON_ONCE(mask > 0xf))
1122 		return -EOPNOTSUPP;
1123 
1124 	shf = abs(src_byte - dst_byte) * 8;
1125 	if (src_byte == dst_byte) {
1126 		sc = SHF_SC_NONE;
1127 	} else if (src_byte < dst_byte) {
1128 		shf = 32 - shf;
1129 		sc = SHF_SC_L_SHF;
1130 	} else {
1131 		sc = SHF_SC_R_SHF;
1132 	}
1133 
1134 	/* ld_field can address fewer indexes, if offset too large do RMW.
1135 	 * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
1136 	 */
1137 	if (idx <= RE_REG_LM_IDX_MAX) {
1138 		reg = reg_lm(lm3 ? 3 : 0, idx);
1139 	} else {
1140 		reg = imm_a(nfp_prog);
1141 		/* Only first and last LMEM locations are going to need RMW,
1142 		 * the middle location will be overwritten fully.
1143 		 */
1144 		if (first || last)
1145 			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
1146 	}
1147 
1148 	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);
1149 
1150 	if (new_gpr || last) {
1151 		if (idx > RE_REG_LM_IDX_MAX)
1152 			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
1153 		if (should_inc)
1154 			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
1155 	}
1156 
1157 	return 0;
1158 }
1159 
1160 static int
1161 mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1162 	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
1163 	     bool clr_gpr, lmem_step step)
1164 {
1165 	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
1166 	bool first = true, narrow_ld, last;
1167 	bool needs_inc = false;
1168 	swreg stack_off_reg;
1169 	u8 prev_gpr = 255;
1170 	u32 gpr_byte = 0;
1171 	bool lm3 = true;
1172 	int ret;
1173 
1174 	if (meta->ptr_not_const ||
1175 	    meta->flags & FLAG_INSN_PTR_CALLER_STACK_FRAME) {
1176 		/* Use of the last encountered ptr_off is OK, they all have
1177 		 * the same alignment.  Depend on low bits of value being
1178 		 * discarded when written to LMaddr register.
1179 		 */
1180 		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
1181 						stack_imm(nfp_prog));
1182 
1183 		emit_alu(nfp_prog, imm_b(nfp_prog),
1184 			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);
1185 
1186 		needs_inc = true;
1187 	} else if (off + size <= 64) {
1188 		/* We can reach bottom 64B with LMaddr0 */
1189 		lm3 = false;
1190 	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
1191 		/* We have to set up a new pointer.  If we know the offset
1192 		 * and the entire access falls into a single 32 byte aligned
1193 		 * window we won't have to increment the LM pointer.
1194 		 * The 32 byte alignment is imporant because offset is ORed in
1195 		 * not added when doing *l$indexN[off].
1196 		 */
1197 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
1198 						stack_imm(nfp_prog));
1199 		emit_alu(nfp_prog, imm_b(nfp_prog),
1200 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1201 
1202 		off %= 32;
1203 	} else {
1204 		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
1205 						stack_imm(nfp_prog));
1206 
1207 		emit_alu(nfp_prog, imm_b(nfp_prog),
1208 			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
1209 
1210 		needs_inc = true;
1211 	}
1212 
1213 	narrow_ld = clr_gpr && size < 8;
1214 
1215 	if (lm3) {
1216 		unsigned int nop_cnt;
1217 
1218 		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
1219 		/* For size < 4 one slot will be filled by zeroing of upper,
1220 		 * but be careful, that zeroing could be eliminated by zext
1221 		 * optimization.
1222 		 */
1223 		nop_cnt = narrow_ld && meta->flags & FLAG_INSN_DO_ZEXT ? 2 : 3;
1224 		wrp_nops(nfp_prog, nop_cnt);
1225 	}
1226 
1227 	if (narrow_ld)
1228 		wrp_zext(nfp_prog, meta, gpr);
1229 
1230 	while (size) {
1231 		u32 slice_end;
1232 		u8 slice_size;
1233 
1234 		slice_size = min(size, 4 - gpr_byte);
1235 		slice_end = min(off + slice_size, round_up(off + 1, 4));
1236 		slice_size = slice_end - off;
1237 
1238 		last = slice_size == size;
1239 
1240 		if (needs_inc)
1241 			off %= 4;
1242 
1243 		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
1244 			   first, gpr != prev_gpr, last, lm3, needs_inc);
1245 		if (ret)
1246 			return ret;
1247 
1248 		prev_gpr = gpr;
1249 		first = false;
1250 
1251 		gpr_byte += slice_size;
1252 		if (gpr_byte >= 4) {
1253 			gpr_byte -= 4;
1254 			gpr++;
1255 		}
1256 
1257 		size -= slice_size;
1258 		off += slice_size;
1259 	}
1260 
1261 	return 0;
1262 }
1263 
1264 static void
1265 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1266 {
1267 	swreg tmp_reg;
1268 
1269 	if (alu_op == ALU_OP_AND) {
1270 		if (!imm)
1271 			wrp_immed(nfp_prog, reg_both(dst), 0);
1272 		if (!imm || !~imm)
1273 			return;
1274 	}
1275 	if (alu_op == ALU_OP_OR) {
1276 		if (!~imm)
1277 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1278 		if (!imm || !~imm)
1279 			return;
1280 	}
1281 	if (alu_op == ALU_OP_XOR) {
1282 		if (!~imm)
1283 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1284 				 ALU_OP_NOT, reg_b(dst));
1285 		if (!imm || !~imm)
1286 			return;
1287 	}
1288 
1289 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1290 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1291 }
1292 
1293 static int
1294 wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1295 	      enum alu_op alu_op, bool skip)
1296 {
1297 	const struct bpf_insn *insn = &meta->insn;
1298 	u64 imm = insn->imm; /* sign extend */
1299 
1300 	if (skip) {
1301 		meta->flags |= FLAG_INSN_SKIP_NOOP;
1302 		return 0;
1303 	}
1304 
1305 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
1306 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);
1307 
1308 	return 0;
1309 }
1310 
1311 static int
1312 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1313 	      enum alu_op alu_op)
1314 {
1315 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1316 
1317 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1318 	emit_alu(nfp_prog, reg_both(dst + 1),
1319 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1320 
1321 	return 0;
1322 }
1323 
1324 static int
1325 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1326 	      enum alu_op alu_op)
1327 {
1328 	const struct bpf_insn *insn = &meta->insn;
1329 	u8 dst = insn->dst_reg * 2;
1330 
1331 	wrp_alu_imm(nfp_prog, dst, alu_op, insn->imm);
1332 	wrp_zext(nfp_prog, meta, dst);
1333 
1334 	return 0;
1335 }
1336 
1337 static int
1338 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1339 	      enum alu_op alu_op)
1340 {
1341 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1342 
1343 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1344 	wrp_zext(nfp_prog, meta, dst);
1345 
1346 	return 0;
1347 }
1348 
1349 static void
1350 wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
1351 		 enum br_mask br_mask, u16 off)
1352 {
1353 	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
1354 	emit_br(nfp_prog, br_mask, off, 0);
1355 }
1356 
1357 static int
1358 wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1359 	     enum alu_op alu_op, enum br_mask br_mask)
1360 {
1361 	const struct bpf_insn *insn = &meta->insn;
1362 
1363 	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
1364 			 insn->src_reg * 2, br_mask, insn->off);
1365 	if (is_mbpf_jmp64(meta))
1366 		wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
1367 				 insn->src_reg * 2 + 1, br_mask, insn->off);
1368 
1369 	return 0;
1370 }
1371 
1372 static const struct jmp_code_map {
1373 	enum br_mask br_mask;
1374 	bool swap;
1375 } jmp_code_map[] = {
1376 	[BPF_JGT >> 4]	= { BR_BLO, true },
1377 	[BPF_JGE >> 4]	= { BR_BHS, false },
1378 	[BPF_JLT >> 4]	= { BR_BLO, false },
1379 	[BPF_JLE >> 4]	= { BR_BHS, true },
1380 	[BPF_JSGT >> 4]	= { BR_BLT, true },
1381 	[BPF_JSGE >> 4]	= { BR_BGE, false },
1382 	[BPF_JSLT >> 4]	= { BR_BLT, false },
1383 	[BPF_JSLE >> 4]	= { BR_BGE, true },
1384 };
1385 
1386 static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
1387 {
1388 	unsigned int op;
1389 
1390 	op = BPF_OP(meta->insn.code) >> 4;
1391 	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
1392 	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
1393 		      !jmp_code_map[op].br_mask,
1394 		      "no code found for jump instruction"))
1395 		return NULL;
1396 
1397 	return &jmp_code_map[op];
1398 }
1399 
1400 static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1401 {
1402 	const struct bpf_insn *insn = &meta->insn;
1403 	u64 imm = insn->imm; /* sign extend */
1404 	const struct jmp_code_map *code;
1405 	enum alu_op alu_op, carry_op;
1406 	u8 reg = insn->dst_reg * 2;
1407 	swreg tmp_reg;
1408 
1409 	code = nfp_jmp_code_get(meta);
1410 	if (!code)
1411 		return -EINVAL;
1412 
1413 	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
1414 	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;
1415 
1416 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
1417 	if (!code->swap)
1418 		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
1419 	else
1420 		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));
1421 
1422 	if (is_mbpf_jmp64(meta)) {
1423 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
1424 		if (!code->swap)
1425 			emit_alu(nfp_prog, reg_none(),
1426 				 reg_a(reg + 1), carry_op, tmp_reg);
1427 		else
1428 			emit_alu(nfp_prog, reg_none(),
1429 				 tmp_reg, carry_op, reg_a(reg + 1));
1430 	}
1431 
1432 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1433 
1434 	return 0;
1435 }
1436 
1437 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1438 {
1439 	const struct bpf_insn *insn = &meta->insn;
1440 	const struct jmp_code_map *code;
1441 	u8 areg, breg;
1442 
1443 	code = nfp_jmp_code_get(meta);
1444 	if (!code)
1445 		return -EINVAL;
1446 
1447 	areg = insn->dst_reg * 2;
1448 	breg = insn->src_reg * 2;
1449 
1450 	if (code->swap) {
1451 		areg ^= breg;
1452 		breg ^= areg;
1453 		areg ^= breg;
1454 	}
1455 
1456 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1457 	if (is_mbpf_jmp64(meta))
1458 		emit_alu(nfp_prog, reg_none(),
1459 			 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1460 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1461 
1462 	return 0;
1463 }
1464 
1465 static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
1466 {
1467 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
1468 		      SHF_SC_R_ROT, 8);
1469 	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
1470 		      SHF_SC_R_ROT, 16);
1471 }
1472 
1473 static void
1474 wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1475 	    swreg rreg, bool gen_high_half)
1476 {
1477 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1478 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
1479 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
1480 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
1481 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
1482 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
1483 	if (gen_high_half)
1484 		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
1485 			 reg_none());
1486 	else
1487 		wrp_immed(nfp_prog, dst_hi, 0);
1488 }
1489 
1490 static void
1491 wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
1492 	    swreg rreg)
1493 {
1494 	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
1495 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
1496 	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
1497 	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
1498 }
1499 
1500 static int
1501 wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1502 	bool gen_high_half, bool ropnd_from_reg)
1503 {
1504 	swreg multiplier, multiplicand, dst_hi, dst_lo;
1505 	const struct bpf_insn *insn = &meta->insn;
1506 	u32 lopnd_max, ropnd_max;
1507 	u8 dst_reg;
1508 
1509 	dst_reg = insn->dst_reg;
1510 	multiplicand = reg_a(dst_reg * 2);
1511 	dst_hi = reg_both(dst_reg * 2 + 1);
1512 	dst_lo = reg_both(dst_reg * 2);
1513 	lopnd_max = meta->umax_dst;
1514 	if (ropnd_from_reg) {
1515 		multiplier = reg_b(insn->src_reg * 2);
1516 		ropnd_max = meta->umax_src;
1517 	} else {
1518 		u32 imm = insn->imm;
1519 
1520 		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1521 		ropnd_max = imm;
1522 	}
1523 	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
1524 		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
1525 			    gen_high_half);
1526 	else
1527 		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);
1528 
1529 	return 0;
1530 }
1531 
1532 static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
1533 {
1534 	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
1535 	struct reciprocal_value_adv rvalue;
1536 	u8 pre_shift, exp;
1537 	swreg magic;
1538 
1539 	if (imm > U32_MAX) {
1540 		wrp_immed(nfp_prog, dst_both, 0);
1541 		return 0;
1542 	}
1543 
1544 	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
1545 	 * support "divisor > (1u << 31)", we need to JIT separate NFP sequence
1546 	 * to handle such case which actually equals to the result of unsigned
1547 	 * comparison "dst >= imm" which could be calculated using the following
1548 	 * NFP sequence:
1549 	 *
1550 	 *  alu[--, dst, -, imm]
1551 	 *  immed[imm, 0]
1552 	 *  alu[dst, imm, +carry, 0]
1553 	 *
1554 	 */
1555 	if (imm > 1U << 31) {
1556 		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1557 
1558 		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
1559 		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
1560 		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
1561 			 reg_imm(0));
1562 		return 0;
1563 	}
1564 
1565 	rvalue = reciprocal_value_adv(imm, 32);
1566 	exp = rvalue.exp;
1567 	if (rvalue.is_wide_m && !(imm & 1)) {
1568 		pre_shift = fls(imm & -imm) - 1;
1569 		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
1570 	} else {
1571 		pre_shift = 0;
1572 	}
1573 	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
1574 	if (imm == 1U << exp) {
1575 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1576 			 SHF_SC_R_SHF, exp);
1577 	} else if (rvalue.is_wide_m) {
1578 		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
1579 			    magic, true);
1580 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
1581 			 imm_b(nfp_prog));
1582 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1583 			 SHF_SC_R_SHF, 1);
1584 		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
1585 			 imm_b(nfp_prog));
1586 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
1587 			 SHF_SC_R_SHF, rvalue.sh - 1);
1588 	} else {
1589 		if (pre_shift)
1590 			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1591 				 dst_b, SHF_SC_R_SHF, pre_shift);
1592 		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
1593 		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
1594 			 dst_b, SHF_SC_R_SHF, rvalue.sh);
1595 	}
1596 
1597 	return 0;
1598 }
1599 
1600 static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1601 {
1602 	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
1603 	struct nfp_bpf_cap_adjust_head *adjust_head;
1604 	u32 ret_einval, end;
1605 
1606 	adjust_head = &nfp_prog->bpf->adjust_head;
1607 
1608 	/* Optimized version - 5 vs 14 cycles */
1609 	if (nfp_prog->adjust_head_location != UINT_MAX) {
1610 		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
1611 			return -EINVAL;
1612 
1613 		emit_alu(nfp_prog, pptr_reg(nfp_prog),
1614 			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
1615 		emit_alu(nfp_prog, plen_reg(nfp_prog),
1616 			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1617 		emit_alu(nfp_prog, pv_len(nfp_prog),
1618 			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1619 
1620 		wrp_immed(nfp_prog, reg_both(0), 0);
1621 		wrp_immed(nfp_prog, reg_both(1), 0);
1622 
1623 		/* TODO: when adjust head is guaranteed to succeed we can
1624 		 * also eliminate the following if (r0 == 0) branch.
1625 		 */
1626 
1627 		return 0;
1628 	}
1629 
1630 	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
1631 	end = ret_einval + 2;
1632 
1633 	/* We need to use a temp because offset is just a part of the pkt ptr */
1634 	emit_alu(nfp_prog, tmp,
1635 		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));
1636 
1637 	/* Validate result will fit within FW datapath constraints */
1638 	emit_alu(nfp_prog, reg_none(),
1639 		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
1640 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1641 	emit_alu(nfp_prog, reg_none(),
1642 		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
1643 	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
1644 
1645 	/* Validate the length is at least ETH_HLEN */
1646 	emit_alu(nfp_prog, tmp_len,
1647 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1648 	emit_alu(nfp_prog, reg_none(),
1649 		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
1650 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1651 
1652 	/* Load the ret code */
1653 	wrp_immed(nfp_prog, reg_both(0), 0);
1654 	wrp_immed(nfp_prog, reg_both(1), 0);
1655 
1656 	/* Modify the packet metadata */
1657 	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);
1658 
1659 	/* Skip over the -EINVAL ret code (defer 2) */
1660 	emit_br(nfp_prog, BR_UNC, end, 2);
1661 
1662 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1663 		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1664 	emit_alu(nfp_prog, pv_len(nfp_prog),
1665 		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
1666 
1667 	/* return -EINVAL target */
1668 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1669 		return -EINVAL;
1670 
1671 	wrp_immed(nfp_prog, reg_both(0), -22);
1672 	wrp_immed(nfp_prog, reg_both(1), ~0);
1673 
1674 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1675 		return -EINVAL;
1676 
1677 	return 0;
1678 }
1679 
1680 static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1681 {
1682 	u32 ret_einval, end;
1683 	swreg plen, delta;
1684 
1685 	BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));
1686 
1687 	plen = imm_a(nfp_prog);
1688 	delta = reg_a(2 * 2);
1689 
1690 	ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
1691 	end = nfp_prog_current_offset(nfp_prog) + 11;
1692 
1693 	/* Calculate resulting length */
1694 	emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
1695 	/* delta == 0 is not allowed by the kernel, add must overflow to make
1696 	 * length smaller.
1697 	 */
1698 	emit_br(nfp_prog, BR_BCC, ret_einval, 0);
1699 
1700 	/* if (new_len < 14) then -EINVAL */
1701 	emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
1702 	emit_br(nfp_prog, BR_BMI, ret_einval, 0);
1703 
1704 	emit_alu(nfp_prog, plen_reg(nfp_prog),
1705 		 plen_reg(nfp_prog), ALU_OP_ADD, delta);
1706 	emit_alu(nfp_prog, pv_len(nfp_prog),
1707 		 pv_len(nfp_prog), ALU_OP_ADD, delta);
1708 
1709 	emit_br(nfp_prog, BR_UNC, end, 2);
1710 	wrp_immed(nfp_prog, reg_both(0), 0);
1711 	wrp_immed(nfp_prog, reg_both(1), 0);
1712 
1713 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
1714 		return -EINVAL;
1715 
1716 	wrp_immed(nfp_prog, reg_both(0), -22);
1717 	wrp_immed(nfp_prog, reg_both(1), ~0);
1718 
1719 	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
1720 		return -EINVAL;
1721 
1722 	return 0;
1723 }
1724 
1725 static int
1726 map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1727 {
1728 	bool load_lm_ptr;
1729 	u32 ret_tgt;
1730 	s64 lm_off;
1731 
1732 	/* We only have to reload LM0 if the key is not at start of stack */
1733 	lm_off = nfp_prog->stack_frame_depth;
1734 	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
1735 	load_lm_ptr = meta->arg2.var_off || lm_off;
1736 
1737 	/* Set LM0 to start of key */
1738 	if (load_lm_ptr)
1739 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
1740 	if (meta->func_id == BPF_FUNC_map_update_elem)
1741 		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
1742 
1743 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1744 		     2, RELO_BR_HELPER);
1745 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
1746 
1747 	/* Load map ID into A0 */
1748 	wrp_mov(nfp_prog, reg_a(0), reg_a(2));
1749 
1750 	/* Load the return address into B0 */
1751 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1752 
1753 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1754 		return -EINVAL;
1755 
1756 	/* Reset the LM0 pointer */
1757 	if (!load_lm_ptr)
1758 		return 0;
1759 
1760 	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
1761 	wrp_nops(nfp_prog, 3);
1762 
1763 	return 0;
1764 }
1765 
1766 static int
1767 nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1768 {
1769 	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
1770 	/* CSR value is read in following immed[gpr, 0] */
1771 	emit_immed(nfp_prog, reg_both(0), 0,
1772 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1773 	emit_immed(nfp_prog, reg_both(1), 0,
1774 		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
1775 	return 0;
1776 }
1777 
1778 static int
1779 nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1780 {
1781 	swreg ptr_type;
1782 	u32 ret_tgt;
1783 
1784 	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));
1785 
1786 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
1787 
1788 	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
1789 		     2, RELO_BR_HELPER);
1790 
1791 	/* Load ptr type into A1 */
1792 	wrp_mov(nfp_prog, reg_a(1), ptr_type);
1793 
1794 	/* Load the return address into B0 */
1795 	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);
1796 
1797 	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
1798 		return -EINVAL;
1799 
1800 	return 0;
1801 }
1802 
1803 static int
1804 nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1805 {
1806 	u32 jmp_tgt;
1807 
1808 	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;
1809 
1810 	/* Make sure the queue id fits into FW field */
1811 	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
1812 		 ALU_OP_AND_NOT_B, reg_imm(0xff));
1813 	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);
1814 
1815 	/* Set the 'queue selected' bit and the queue value */
1816 	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
1817 		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
1818 		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
1819 	emit_ld_field(nfp_prog,
1820 		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
1821 		      SHF_SC_NONE, 0);
1822 	/* Delay slots end here, we will jump over next instruction if queue
1823 	 * value fits into the field.
1824 	 */
1825 	emit_ld_field(nfp_prog,
1826 		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
1827 		      SHF_SC_NONE, 0);
1828 
1829 	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
1830 		return -EINVAL;
1831 
1832 	return 0;
1833 }
1834 
1835 /* --- Callbacks --- */
1836 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1837 {
1838 	const struct bpf_insn *insn = &meta->insn;
1839 	u8 dst = insn->dst_reg * 2;
1840 	u8 src = insn->src_reg * 2;
1841 
1842 	if (insn->src_reg == BPF_REG_10) {
1843 		swreg stack_depth_reg;
1844 
1845 		stack_depth_reg = ur_load_imm_any(nfp_prog,
1846 						  nfp_prog->stack_frame_depth,
1847 						  stack_imm(nfp_prog));
1848 		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
1849 			 ALU_OP_ADD, stack_depth_reg);
1850 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
1851 	} else {
1852 		wrp_reg_mov(nfp_prog, dst, src);
1853 		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
1854 	}
1855 
1856 	return 0;
1857 }
1858 
1859 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1860 {
1861 	u64 imm = meta->insn.imm; /* sign extend */
1862 
1863 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1864 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1865 
1866 	return 0;
1867 }
1868 
1869 static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1870 {
1871 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
1872 }
1873 
1874 static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1875 {
1876 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
1877 }
1878 
1879 static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1880 {
1881 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
1882 }
1883 
1884 static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1885 {
1886 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
1887 }
1888 
1889 static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1890 {
1891 	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
1892 }
1893 
1894 static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1895 {
1896 	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
1897 }
1898 
1899 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1900 {
1901 	const struct bpf_insn *insn = &meta->insn;
1902 
1903 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1904 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1905 		 reg_b(insn->src_reg * 2));
1906 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1907 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1908 		 reg_b(insn->src_reg * 2 + 1));
1909 
1910 	return 0;
1911 }
1912 
1913 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1914 {
1915 	const struct bpf_insn *insn = &meta->insn;
1916 	u64 imm = insn->imm; /* sign extend */
1917 
1918 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1919 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1920 
1921 	return 0;
1922 }
1923 
1924 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1925 {
1926 	const struct bpf_insn *insn = &meta->insn;
1927 
1928 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1929 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1930 		 reg_b(insn->src_reg * 2));
1931 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1932 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1933 		 reg_b(insn->src_reg * 2 + 1));
1934 
1935 	return 0;
1936 }
1937 
1938 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1939 {
1940 	const struct bpf_insn *insn = &meta->insn;
1941 	u64 imm = insn->imm; /* sign extend */
1942 
1943 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1944 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1945 
1946 	return 0;
1947 }
1948 
1949 static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1950 {
1951 	return wrp_mul(nfp_prog, meta, true, true);
1952 }
1953 
1954 static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1955 {
1956 	return wrp_mul(nfp_prog, meta, true, false);
1957 }
1958 
1959 static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1960 {
1961 	const struct bpf_insn *insn = &meta->insn;
1962 
1963 	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
1964 }
1965 
1966 static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1967 {
1968 	/* NOTE: verifier hook has rejected cases for which verifier doesn't
1969 	 * know whether the source operand is constant or not.
1970 	 */
1971 	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
1972 }
1973 
1974 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1975 {
1976 	const struct bpf_insn *insn = &meta->insn;
1977 
1978 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1979 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1980 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1981 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1982 
1983 	return 0;
1984 }
1985 
1986 /* Pseudo code:
1987  *   if shift_amt >= 32
1988  *     dst_high = dst_low << shift_amt[4:0]
1989  *     dst_low = 0;
1990  *   else
1991  *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
1992  *     dst_low = dst_low << shift_amt
1993  *
1994  * The indirect shift will use the same logic at runtime.
1995  */
1996 static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
1997 {
1998 	if (!shift_amt)
1999 		return 0;
2000 
2001 	if (shift_amt < 32) {
2002 		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
2003 			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
2004 			 32 - shift_amt);
2005 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2006 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
2007 	} else if (shift_amt == 32) {
2008 		wrp_reg_mov(nfp_prog, dst + 1, dst);
2009 		wrp_immed(nfp_prog, reg_both(dst), 0);
2010 	} else if (shift_amt > 32) {
2011 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2012 			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
2013 		wrp_immed(nfp_prog, reg_both(dst), 0);
2014 	}
2015 
2016 	return 0;
2017 }
2018 
2019 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2020 {
2021 	const struct bpf_insn *insn = &meta->insn;
2022 	u8 dst = insn->dst_reg * 2;
2023 
2024 	return __shl_imm64(nfp_prog, dst, insn->imm);
2025 }
2026 
2027 static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2028 {
2029 	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
2030 		 reg_b(src));
2031 	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
2032 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
2033 		       reg_b(dst), SHF_SC_R_DSHF);
2034 }
2035 
2036 /* NOTE: for indirect left shift, HIGH part should be calculated first. */
2037 static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2038 {
2039 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2040 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2041 		       reg_b(dst), SHF_SC_L_SHF);
2042 }
2043 
2044 static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2045 {
2046 	shl_reg64_lt32_high(nfp_prog, dst, src);
2047 	shl_reg64_lt32_low(nfp_prog, dst, src);
2048 }
2049 
2050 static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2051 {
2052 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2053 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2054 		       reg_b(dst), SHF_SC_L_SHF);
2055 	wrp_immed(nfp_prog, reg_both(dst), 0);
2056 }
2057 
2058 static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2059 {
2060 	const struct bpf_insn *insn = &meta->insn;
2061 	u64 umin, umax;
2062 	u8 dst, src;
2063 
2064 	dst = insn->dst_reg * 2;
2065 	umin = meta->umin_src;
2066 	umax = meta->umax_src;
2067 	if (umin == umax)
2068 		return __shl_imm64(nfp_prog, dst, umin);
2069 
2070 	src = insn->src_reg * 2;
2071 	if (umax < 32) {
2072 		shl_reg64_lt32(nfp_prog, dst, src);
2073 	} else if (umin >= 32) {
2074 		shl_reg64_ge32(nfp_prog, dst, src);
2075 	} else {
2076 		/* Generate different instruction sequences depending on runtime
2077 		 * value of shift amount.
2078 		 */
2079 		u16 label_ge32, label_end;
2080 
2081 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
2082 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2083 
2084 		shl_reg64_lt32_high(nfp_prog, dst, src);
2085 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2086 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2087 		/* shl_reg64_lt32_low packed in delay slot. */
2088 		shl_reg64_lt32_low(nfp_prog, dst, src);
2089 
2090 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2091 			return -EINVAL;
2092 		shl_reg64_ge32(nfp_prog, dst, src);
2093 
2094 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2095 			return -EINVAL;
2096 	}
2097 
2098 	return 0;
2099 }
2100 
2101 /* Pseudo code:
2102  *   if shift_amt >= 32
2103  *     dst_high = 0;
2104  *     dst_low = dst_high >> shift_amt[4:0]
2105  *   else
2106  *     dst_high = dst_high >> shift_amt
2107  *     dst_low = (dst_high, dst_low) >> shift_amt
2108  *
2109  * The indirect shift will use the same logic at runtime.
2110  */
2111 static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2112 {
2113 	if (!shift_amt)
2114 		return 0;
2115 
2116 	if (shift_amt < 32) {
2117 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2118 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2119 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2120 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2121 	} else if (shift_amt == 32) {
2122 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2123 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2124 	} else if (shift_amt > 32) {
2125 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2126 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2127 		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2128 	}
2129 
2130 	return 0;
2131 }
2132 
2133 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2134 {
2135 	const struct bpf_insn *insn = &meta->insn;
2136 	u8 dst = insn->dst_reg * 2;
2137 
2138 	return __shr_imm64(nfp_prog, dst, insn->imm);
2139 }
2140 
2141 /* NOTE: for indirect right shift, LOW part should be calculated first. */
2142 static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2143 {
2144 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2145 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
2146 		       reg_b(dst + 1), SHF_SC_R_SHF);
2147 }
2148 
2149 static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2150 {
2151 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2152 	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2153 		       reg_b(dst), SHF_SC_R_DSHF);
2154 }
2155 
2156 static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2157 {
2158 	shr_reg64_lt32_low(nfp_prog, dst, src);
2159 	shr_reg64_lt32_high(nfp_prog, dst, src);
2160 }
2161 
2162 static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2163 {
2164 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2165 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2166 		       reg_b(dst + 1), SHF_SC_R_SHF);
2167 	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
2168 }
2169 
2170 static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2171 {
2172 	const struct bpf_insn *insn = &meta->insn;
2173 	u64 umin, umax;
2174 	u8 dst, src;
2175 
2176 	dst = insn->dst_reg * 2;
2177 	umin = meta->umin_src;
2178 	umax = meta->umax_src;
2179 	if (umin == umax)
2180 		return __shr_imm64(nfp_prog, dst, umin);
2181 
2182 	src = insn->src_reg * 2;
2183 	if (umax < 32) {
2184 		shr_reg64_lt32(nfp_prog, dst, src);
2185 	} else if (umin >= 32) {
2186 		shr_reg64_ge32(nfp_prog, dst, src);
2187 	} else {
2188 		/* Generate different instruction sequences depending on runtime
2189 		 * value of shift amount.
2190 		 */
2191 		u16 label_ge32, label_end;
2192 
2193 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2194 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2195 		shr_reg64_lt32_low(nfp_prog, dst, src);
2196 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2197 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2198 		/* shr_reg64_lt32_high packed in delay slot. */
2199 		shr_reg64_lt32_high(nfp_prog, dst, src);
2200 
2201 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2202 			return -EINVAL;
2203 		shr_reg64_ge32(nfp_prog, dst, src);
2204 
2205 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2206 			return -EINVAL;
2207 	}
2208 
2209 	return 0;
2210 }
2211 
2212 /* Code logic is the same as __shr_imm64 except ashr requires signedness bit
2213  * told through PREV_ALU result.
2214  */
2215 static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
2216 {
2217 	if (!shift_amt)
2218 		return 0;
2219 
2220 	if (shift_amt < 32) {
2221 		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
2222 			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
2223 		/* Set signedness bit. */
2224 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2225 			 reg_imm(0));
2226 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2227 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
2228 	} else if (shift_amt == 32) {
2229 		/* NOTE: this also helps setting signedness bit. */
2230 		wrp_reg_mov(nfp_prog, dst, dst + 1);
2231 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2232 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2233 	} else if (shift_amt > 32) {
2234 		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
2235 			 reg_imm(0));
2236 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2237 			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
2238 		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2239 			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2240 	}
2241 
2242 	return 0;
2243 }
2244 
2245 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2246 {
2247 	const struct bpf_insn *insn = &meta->insn;
2248 	u8 dst = insn->dst_reg * 2;
2249 
2250 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2251 }
2252 
2253 static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2254 {
2255 	/* NOTE: the first insn will set both indirect shift amount (source A)
2256 	 * and signedness bit (MSB of result).
2257 	 */
2258 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2259 	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2260 		       reg_b(dst + 1), SHF_SC_R_SHF);
2261 }
2262 
2263 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2264 {
2265 	/* NOTE: it is the same as logic shift because we don't need to shift in
2266 	 * signedness bit when the shift amount is less than 32.
2267 	 */
2268 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2269 }
2270 
2271 static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2272 {
2273 	ashr_reg64_lt32_low(nfp_prog, dst, src);
2274 	ashr_reg64_lt32_high(nfp_prog, dst, src);
2275 }
2276 
2277 static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2278 {
2279 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
2280 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2281 		       reg_b(dst + 1), SHF_SC_R_SHF);
2282 	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
2283 		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
2284 }
2285 
2286 /* Like ashr_imm64, but need to use indirect shift. */
2287 static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2288 {
2289 	const struct bpf_insn *insn = &meta->insn;
2290 	u64 umin, umax;
2291 	u8 dst, src;
2292 
2293 	dst = insn->dst_reg * 2;
2294 	umin = meta->umin_src;
2295 	umax = meta->umax_src;
2296 	if (umin == umax)
2297 		return __ashr_imm64(nfp_prog, dst, umin);
2298 
2299 	src = insn->src_reg * 2;
2300 	if (umax < 32) {
2301 		ashr_reg64_lt32(nfp_prog, dst, src);
2302 	} else if (umin >= 32) {
2303 		ashr_reg64_ge32(nfp_prog, dst, src);
2304 	} else {
2305 		u16 label_ge32, label_end;
2306 
2307 		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
2308 		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
2309 		ashr_reg64_lt32_low(nfp_prog, dst, src);
2310 		label_end = nfp_prog_current_offset(nfp_prog) + 6;
2311 		emit_br(nfp_prog, BR_UNC, label_end, 2);
2312 		/* ashr_reg64_lt32_high packed in delay slot. */
2313 		ashr_reg64_lt32_high(nfp_prog, dst, src);
2314 
2315 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
2316 			return -EINVAL;
2317 		ashr_reg64_ge32(nfp_prog, dst, src);
2318 
2319 		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
2320 			return -EINVAL;
2321 	}
2322 
2323 	return 0;
2324 }
2325 
2326 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2327 {
2328 	const struct bpf_insn *insn = &meta->insn;
2329 
2330 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2331 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2332 
2333 	return 0;
2334 }
2335 
2336 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2337 {
2338 	const struct bpf_insn *insn = &meta->insn;
2339 
2340 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2341 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2342 
2343 	return 0;
2344 }
2345 
2346 static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2347 {
2348 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
2349 }
2350 
2351 static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2352 {
2353 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR);
2354 }
2355 
2356 static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2357 {
2358 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
2359 }
2360 
2361 static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2362 {
2363 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND);
2364 }
2365 
2366 static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2367 {
2368 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
2369 }
2370 
2371 static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2372 {
2373 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR);
2374 }
2375 
2376 static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2377 {
2378 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
2379 }
2380 
2381 static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2382 {
2383 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD);
2384 }
2385 
2386 static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2387 {
2388 	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
2389 }
2390 
2391 static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2392 {
2393 	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB);
2394 }
2395 
2396 static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2397 {
2398 	return wrp_mul(nfp_prog, meta, false, true);
2399 }
2400 
2401 static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2402 {
2403 	return wrp_mul(nfp_prog, meta, false, false);
2404 }
2405 
/* Divide by a register operand; shares its implementation with div_reg64. */
static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = div_reg64(nfp_prog, meta);

	return ret;
}
2410 
/* Divide by an immediate; shares its implementation with div_imm64. */
static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = div_imm64(nfp_prog, meta);

	return ret;
}
2415 
2416 static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2417 {
2418 	u8 dst = meta->insn.dst_reg * 2;
2419 
2420 	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
2421 	wrp_zext(nfp_prog, meta, dst);
2422 
2423 	return 0;
2424 }
2425 
2426 static int
2427 __ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2428 	   u8 shift_amt)
2429 {
2430 	if (shift_amt) {
2431 		/* Set signedness bit (MSB of result). */
2432 		emit_alu(nfp_prog, reg_none(), reg_a(dst), ALU_OP_OR,
2433 			 reg_imm(0));
2434 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2435 			 reg_b(dst), SHF_SC_R_SHF, shift_amt);
2436 	}
2437 	wrp_zext(nfp_prog, meta, dst);
2438 
2439 	return 0;
2440 }
2441 
2442 static int ashr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2443 {
2444 	const struct bpf_insn *insn = &meta->insn;
2445 	u64 umin, umax;
2446 	u8 dst, src;
2447 
2448 	dst = insn->dst_reg * 2;
2449 	umin = meta->umin_src;
2450 	umax = meta->umax_src;
2451 	if (umin == umax)
2452 		return __ashr_imm(nfp_prog, meta, dst, umin);
2453 
2454 	src = insn->src_reg * 2;
2455 	/* NOTE: the first insn will set both indirect shift amount (source A)
2456 	 * and signedness bit (MSB of result).
2457 	 */
2458 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst));
2459 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
2460 		       reg_b(dst), SHF_SC_R_SHF);
2461 	wrp_zext(nfp_prog, meta, dst);
2462 
2463 	return 0;
2464 }
2465 
2466 static int ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2467 {
2468 	const struct bpf_insn *insn = &meta->insn;
2469 	u8 dst = insn->dst_reg * 2;
2470 
2471 	return __ashr_imm(nfp_prog, meta, dst, insn->imm);
2472 }
2473 
2474 static int
2475 __shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2476 	  u8 shift_amt)
2477 {
2478 	if (shift_amt)
2479 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2480 			 reg_b(dst), SHF_SC_R_SHF, shift_amt);
2481 	wrp_zext(nfp_prog, meta, dst);
2482 	return 0;
2483 }
2484 
2485 static int shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2486 {
2487 	const struct bpf_insn *insn = &meta->insn;
2488 	u8 dst = insn->dst_reg * 2;
2489 
2490 	return __shr_imm(nfp_prog, meta, dst, insn->imm);
2491 }
2492 
2493 static int shr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2494 {
2495 	const struct bpf_insn *insn = &meta->insn;
2496 	u64 umin, umax;
2497 	u8 dst, src;
2498 
2499 	dst = insn->dst_reg * 2;
2500 	umin = meta->umin_src;
2501 	umax = meta->umax_src;
2502 	if (umin == umax)
2503 		return __shr_imm(nfp_prog, meta, dst, umin);
2504 
2505 	src = insn->src_reg * 2;
2506 	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
2507 	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2508 		       reg_b(dst), SHF_SC_R_SHF);
2509 	wrp_zext(nfp_prog, meta, dst);
2510 	return 0;
2511 }
2512 
2513 static int
2514 __shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
2515 	  u8 shift_amt)
2516 {
2517 	if (shift_amt)
2518 		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
2519 			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
2520 	wrp_zext(nfp_prog, meta, dst);
2521 	return 0;
2522 }
2523 
2524 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2525 {
2526 	const struct bpf_insn *insn = &meta->insn;
2527 	u8 dst = insn->dst_reg * 2;
2528 
2529 	return __shl_imm(nfp_prog, meta, dst, insn->imm);
2530 }
2531 
2532 static int shl_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2533 {
2534 	const struct bpf_insn *insn = &meta->insn;
2535 	u64 umin, umax;
2536 	u8 dst, src;
2537 
2538 	dst = insn->dst_reg * 2;
2539 	umin = meta->umin_src;
2540 	umax = meta->umax_src;
2541 	if (umin == umax)
2542 		return __shl_imm(nfp_prog, meta, dst, umin);
2543 
2544 	src = insn->src_reg * 2;
2545 	shl_reg64_lt32_low(nfp_prog, dst, src);
2546 	wrp_zext(nfp_prog, meta, dst);
2547 	return 0;
2548 }
2549 
2550 static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2551 {
2552 	const struct bpf_insn *insn = &meta->insn;
2553 	u8 gpr = insn->dst_reg * 2;
2554 
2555 	switch (insn->imm) {
2556 	case 16:
2557 		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
2558 			      SHF_SC_R_ROT, 8);
2559 		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
2560 			      SHF_SC_R_SHF, 16);
2561 
2562 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2563 		break;
2564 	case 32:
2565 		wrp_end32(nfp_prog, reg_a(gpr), gpr);
2566 		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
2567 		break;
2568 	case 64:
2569 		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));
2570 
2571 		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
2572 		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
2573 		break;
2574 	}
2575 
2576 	return 0;
2577 }
2578 
2579 static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2580 {
2581 	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
2582 	u32 imm_lo, imm_hi;
2583 	u8 dst;
2584 
2585 	dst = prev->insn.dst_reg * 2;
2586 	imm_lo = prev->insn.imm;
2587 	imm_hi = meta->insn.imm;
2588 
2589 	wrp_immed(nfp_prog, reg_both(dst), imm_lo);
2590 
2591 	/* mov is always 1 insn, load imm may be two, so try to use mov */
2592 	if (imm_hi == imm_lo)
2593 		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
2594 	else
2595 		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);
2596 
2597 	return 0;
2598 }
2599 
2600 static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2601 {
2602 	meta->double_cb = imm_ld8_part2;
2603 	return 0;
2604 }
2605 
2606 static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2607 {
2608 	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 1);
2609 }
2610 
2611 static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2612 {
2613 	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 2);
2614 }
2615 
2616 static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2617 {
2618 	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 4);
2619 }
2620 
2621 static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2622 {
2623 	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
2624 				     meta->insn.src_reg * 2, 1);
2625 }
2626 
2627 static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2628 {
2629 	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
2630 				     meta->insn.src_reg * 2, 2);
2631 }
2632 
2633 static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2634 {
2635 	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
2636 				     meta->insn.src_reg * 2, 4);
2637 }
2638 
2639 static int
2640 mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2641 	      unsigned int size, unsigned int ptr_off)
2642 {
2643 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2644 			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
2645 			    true, wrp_lmem_load);
2646 }
2647 
2648 static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2649 		       u8 size)
2650 {
2651 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2652 
2653 	switch (meta->insn.off) {
2654 	case offsetof(struct __sk_buff, len):
2655 		if (size != sizeof_field(struct __sk_buff, len))
2656 			return -EOPNOTSUPP;
2657 		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
2658 		break;
2659 	case offsetof(struct __sk_buff, data):
2660 		if (size != sizeof_field(struct __sk_buff, data))
2661 			return -EOPNOTSUPP;
2662 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2663 		break;
2664 	case offsetof(struct __sk_buff, data_end):
2665 		if (size != sizeof_field(struct __sk_buff, data_end))
2666 			return -EOPNOTSUPP;
2667 		emit_alu(nfp_prog, dst,
2668 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2669 		break;
2670 	default:
2671 		return -EOPNOTSUPP;
2672 	}
2673 
2674 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2675 
2676 	return 0;
2677 }
2678 
2679 static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2680 		       u8 size)
2681 {
2682 	swreg dst = reg_both(meta->insn.dst_reg * 2);
2683 
2684 	switch (meta->insn.off) {
2685 	case offsetof(struct xdp_md, data):
2686 		if (size != sizeof_field(struct xdp_md, data))
2687 			return -EOPNOTSUPP;
2688 		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
2689 		break;
2690 	case offsetof(struct xdp_md, data_end):
2691 		if (size != sizeof_field(struct xdp_md, data_end))
2692 			return -EOPNOTSUPP;
2693 		emit_alu(nfp_prog, dst,
2694 			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
2695 		break;
2696 	default:
2697 		return -EOPNOTSUPP;
2698 	}
2699 
2700 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
2701 
2702 	return 0;
2703 }
2704 
2705 static int
2706 mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2707 	     unsigned int size)
2708 {
2709 	swreg tmp_reg;
2710 
2711 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2712 
2713 	return data_ld_host_order_addr32(nfp_prog, meta, meta->insn.src_reg * 2,
2714 					 tmp_reg, meta->insn.dst_reg * 2, size);
2715 }
2716 
2717 static int
2718 mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2719 	     unsigned int size)
2720 {
2721 	swreg tmp_reg;
2722 
2723 	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2724 
2725 	return data_ld_host_order_addr40(nfp_prog, meta, meta->insn.src_reg * 2,
2726 					 tmp_reg, meta->insn.dst_reg * 2, size);
2727 }
2728 
2729 static void
2730 mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
2731 			   struct nfp_insn_meta *meta)
2732 {
2733 	s16 range_start = meta->pkt_cache.range_start;
2734 	s16 range_end = meta->pkt_cache.range_end;
2735 	swreg src_base, off;
2736 	u8 xfer_num, len;
2737 	bool indir;
2738 
2739 	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
2740 	src_base = reg_a(meta->insn.src_reg * 2);
2741 	len = range_end - range_start;
2742 	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
2743 
2744 	indir = len > 8 * REG_WIDTH;
2745 	/* Setup PREV_ALU for indirect mode. */
2746 	if (indir)
2747 		wrp_immed(nfp_prog, reg_none(),
2748 			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
2749 
2750 	/* Cache memory into transfer-in registers. */
2751 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
2752 		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
2753 }
2754 
2755 static int
2756 mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
2757 				     struct nfp_insn_meta *meta,
2758 				     unsigned int size)
2759 {
2760 	s16 range_start = meta->pkt_cache.range_start;
2761 	s16 insn_off = meta->insn.off - range_start;
2762 	swreg dst_lo, dst_hi, src_lo, src_mid;
2763 	u8 dst_gpr = meta->insn.dst_reg * 2;
2764 	u8 len_lo = size, len_mid = 0;
2765 	u8 idx = insn_off / REG_WIDTH;
2766 	u8 off = insn_off % REG_WIDTH;
2767 
2768 	dst_hi = reg_both(dst_gpr + 1);
2769 	dst_lo = reg_both(dst_gpr);
2770 	src_lo = reg_xfer(idx);
2771 
2772 	/* The read length could involve as many as three registers. */
2773 	if (size > REG_WIDTH - off) {
2774 		/* Calculate the part in the second register. */
2775 		len_lo = REG_WIDTH - off;
2776 		len_mid = size - len_lo;
2777 
2778 		/* Calculate the part in the third register. */
2779 		if (size > 2 * REG_WIDTH - off)
2780 			len_mid = REG_WIDTH;
2781 	}
2782 
2783 	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
2784 
2785 	if (!len_mid) {
2786 		wrp_zext(nfp_prog, meta, dst_gpr);
2787 		return 0;
2788 	}
2789 
2790 	src_mid = reg_xfer(idx + 1);
2791 
2792 	if (size <= REG_WIDTH) {
2793 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
2794 		wrp_zext(nfp_prog, meta, dst_gpr);
2795 	} else {
2796 		swreg src_hi = reg_xfer(idx + 2);
2797 
2798 		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
2799 				   REG_WIDTH - len_lo, len_lo);
2800 		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
2801 				REG_WIDTH - len_lo);
2802 		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
2803 				   len_lo);
2804 	}
2805 
2806 	return 0;
2807 }
2808 
2809 static int
2810 mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
2811 				   struct nfp_insn_meta *meta,
2812 				   unsigned int size)
2813 {
2814 	swreg dst_lo, dst_hi, src_lo;
2815 	u8 dst_gpr, idx;
2816 
2817 	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
2818 	dst_gpr = meta->insn.dst_reg * 2;
2819 	dst_hi = reg_both(dst_gpr + 1);
2820 	dst_lo = reg_both(dst_gpr);
2821 	src_lo = reg_xfer(idx);
2822 
2823 	if (size < REG_WIDTH) {
2824 		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
2825 		wrp_zext(nfp_prog, meta, dst_gpr);
2826 	} else if (size == REG_WIDTH) {
2827 		wrp_mov(nfp_prog, dst_lo, src_lo);
2828 		wrp_zext(nfp_prog, meta, dst_gpr);
2829 	} else {
2830 		swreg src_hi = reg_xfer(idx + 1);
2831 
2832 		wrp_mov(nfp_prog, dst_lo, src_lo);
2833 		wrp_mov(nfp_prog, dst_hi, src_hi);
2834 	}
2835 
2836 	return 0;
2837 }
2838 
2839 static int
2840 mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
2841 			   struct nfp_insn_meta *meta, unsigned int size)
2842 {
2843 	u8 off = meta->insn.off - meta->pkt_cache.range_start;
2844 
2845 	if (IS_ALIGNED(off, REG_WIDTH))
2846 		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
2847 
2848 	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
2849 }
2850 
2851 static int
2852 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2853 	unsigned int size)
2854 {
2855 	if (meta->ldst_gather_len)
2856 		return nfp_cpp_memcpy(nfp_prog, meta);
2857 
2858 	if (meta->ptr.type == PTR_TO_CTX) {
2859 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
2860 			return mem_ldx_xdp(nfp_prog, meta, size);
2861 		else
2862 			return mem_ldx_skb(nfp_prog, meta, size);
2863 	}
2864 
2865 	if (meta->ptr.type == PTR_TO_PACKET) {
2866 		if (meta->pkt_cache.range_end) {
2867 			if (meta->pkt_cache.do_init)
2868 				mem_ldx_data_init_pktcache(nfp_prog, meta);
2869 
2870 			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
2871 		} else {
2872 			return mem_ldx_data(nfp_prog, meta, size);
2873 		}
2874 	}
2875 
2876 	if (meta->ptr.type == PTR_TO_STACK)
2877 		return mem_ldx_stack(nfp_prog, meta, size,
2878 				     meta->ptr.off + meta->ptr.var_off.value);
2879 
2880 	if (meta->ptr.type == PTR_TO_MAP_VALUE)
2881 		return mem_ldx_emem(nfp_prog, meta, size);
2882 
2883 	return -EOPNOTSUPP;
2884 }
2885 
/* 1-byte memory load. */
static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_ldx(nfp_prog, meta, 1);

	return ret;
}
2890 
/* 2-byte memory load. */
static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_ldx(nfp_prog, meta, 2);

	return ret;
}
2895 
/* 4-byte memory load. */
static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_ldx(nfp_prog, meta, 4);

	return ret;
}
2900 
/* 8-byte memory load. */
static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_ldx(nfp_prog, meta, 8);

	return ret;
}
2905 
2906 static int
2907 mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2908 	    unsigned int size)
2909 {
2910 	u64 imm = meta->insn.imm; /* sign extend */
2911 	swreg off_reg;
2912 
2913 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2914 
2915 	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2916 				  imm, size);
2917 }
2918 
2919 static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2920 		  unsigned int size)
2921 {
2922 	if (meta->ptr.type == PTR_TO_PACKET)
2923 		return mem_st_data(nfp_prog, meta, size);
2924 
2925 	return -EOPNOTSUPP;
2926 }
2927 
/* 1-byte immediate store. */
static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_st(nfp_prog, meta, 1);

	return ret;
}
2932 
/* 2-byte immediate store. */
static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_st(nfp_prog, meta, 2);

	return ret;
}
2937 
/* 4-byte immediate store. */
static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_st(nfp_prog, meta, 4);

	return ret;
}
2942 
/* 8-byte immediate store. */
static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	int ret;

	ret = mem_st(nfp_prog, meta, 8);

	return ret;
}
2947 
2948 static int
2949 mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2950 	     unsigned int size)
2951 {
2952 	swreg off_reg;
2953 
2954 	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
2955 
2956 	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
2957 				   meta->insn.src_reg * 2, size);
2958 }
2959 
2960 static int
2961 mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2962 	      unsigned int size, unsigned int ptr_off)
2963 {
2964 	return mem_op_stack(nfp_prog, meta, size, ptr_off,
2965 			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
2966 			    false, wrp_lmem_store);
2967 }
2968 
2969 static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2970 {
2971 	switch (meta->insn.off) {
2972 	case offsetof(struct xdp_md, rx_queue_index):
2973 		return nfp_queue_select(nfp_prog, meta);
2974 	}
2975 
2976 	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
2977 	return -EOPNOTSUPP;
2978 }
2979 
2980 static int
2981 mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
2982 	unsigned int size)
2983 {
2984 	if (meta->ptr.type == PTR_TO_PACKET)
2985 		return mem_stx_data(nfp_prog, meta, size);
2986 
2987 	if (meta->ptr.type == PTR_TO_STACK)
2988 		return mem_stx_stack(nfp_prog, meta, size,
2989 				     meta->ptr.off + meta->ptr.var_off.value);
2990 
2991 	return -EOPNOTSUPP;
2992 }
2993 
/* BPF_STX | BPF_MEM | BPF_B: store the low byte of a register. */
static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const unsigned int size = 1;

	return mem_stx(nfp_prog, meta, size);
}
2998 
/* BPF_STX | BPF_MEM | BPF_H: store the low 2 bytes of a register. */
static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const unsigned int size = 2;

	return mem_stx(nfp_prog, meta, size);
}
3003 
3004 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3005 {
3006 	if (meta->ptr.type == PTR_TO_CTX)
3007 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
3008 			return mem_stx_xdp(nfp_prog, meta);
3009 	return mem_stx(nfp_prog, meta, 4);
3010 }
3011 
/* BPF_STX | BPF_MEM | BPF_DW: store a full 8 byte register. */
static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const unsigned int size = 8;

	return mem_stx(nfp_prog, meta, size);
}
3016 
/* Emit an atomic add of the source register to the memory location addressed
 * by the destination register pair plus the instruction offset.
 * @is64 selects a 64-bit add (true) or a 32-bit add (false).
 *
 * Depending on meta->xadd_maybe_16bit and meta->xadd_over_16bit either the
 * add_imm form, the full add form, or both (with a run-time branch choosing
 * between them) are generated.
 *
 * Returns 0 on success or -EINVAL if the emitted code does not end at the
 * offsets computed up front.
 */
static int
mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
{
	u8 dst_gpr = meta->insn.dst_reg * 2;
	u8 src_gpr = meta->insn.src_reg * 2;
	unsigned int full_add, out;
	swreg addra, addrb, off;

	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	/* We can fit 16 bits into command immediate, if we know the immediate
	 * is guaranteed to either always or never fit into 16 bit we only
	 * generate code to handle that particular case, otherwise generate
	 * code for both.
	 */
	/* Pre-compute where the full add sequence (full_add) and the first
	 * instruction past this block (out) are expected to land.  They serve
	 * as branch targets below and are verified after emission.
	 */
	out = nfp_prog_current_offset(nfp_prog);
	full_add = nfp_prog_current_offset(nfp_prog);

	if (meta->insn.off) {
		out += 2;
		full_add += 2;
	}
	if (meta->xadd_maybe_16bit) {
		out += 3;
		full_add += 3;
	}
	if (meta->xadd_over_16bit)
		out += 2 + is64;
	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
		out += 5;
		full_add += 5;
	}

	/* Generate the branch for choosing add_imm vs add */
	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
		swreg max_imm = imm_a(nfp_prog);

		/* 64-bit compare of 0xffff against the source register pair,
		 * take the full add path if the value does not fit 16 bits.
		 */
		wrp_immed(nfp_prog, max_imm, 0xffff);
		emit_alu(nfp_prog, reg_none(),
			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
		emit_alu(nfp_prog, reg_none(),
			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
		/* defer for add */
	}

	/* If insn has an offset add to the address */
	if (!meta->insn.off) {
		addra = reg_a(dst_gpr);
		addrb = reg_b(dst_gpr + 1);
	} else {
		emit_alu(nfp_prog, imma_a(nfp_prog),
			 reg_a(dst_gpr), ALU_OP_ADD, off);
		emit_alu(nfp_prog, imma_b(nfp_prog),
			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
		addra = imma_a(nfp_prog);
		addrb = imma_b(nfp_prog);
	}

	/* Generate the add_imm if 16 bits are possible */
	if (meta->xadd_maybe_16bit) {
		swreg prev_alu = imm_a(nfp_prog);

		wrp_immed(nfp_prog, prev_alu,
			  FIELD_PREP(CMD_OVE_DATA, 2) |
			  CMD_OVE_LEN |
			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
			       addra, addrb, 0, CMD_CTX_NO_SWAP);

		/* Skip over the full add when both variants were generated */
		if (meta->xadd_over_16bit)
			emit_br(nfp_prog, BR_UNC, out, 0);
	}

	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
		return -EINVAL;

	/* Generate the add if 16 bits are not guaranteed */
	if (meta->xadd_over_16bit) {
		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
			 addra, addrb, is64 << 2,
			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);

		/* The operand is passed in transfer registers, one or two of
		 * them depending on the width of the operation.
		 */
		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
		if (is64)
			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
	}

	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
		return -EINVAL;

	return 0;
}
3111 
3112 static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3113 {
3114 	if (meta->insn.imm != BPF_ADD)
3115 		return -EOPNOTSUPP;
3116 
3117 	return mem_xadd(nfp_prog, meta, false);
3118 }
3119 
3120 static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3121 {
3122 	if (meta->insn.imm != BPF_ADD)
3123 		return -EOPNOTSUPP;
3124 
3125 	return mem_xadd(nfp_prog, meta, true);
3126 }
3127 
3128 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3129 {
3130 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
3131 
3132 	return 0;
3133 }
3134 
3135 static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3136 {
3137 	const struct bpf_insn *insn = &meta->insn;
3138 	u64 imm = insn->imm; /* sign extend */
3139 	swreg or1, or2, tmp_reg;
3140 
3141 	or1 = reg_a(insn->dst_reg * 2);
3142 	or2 = reg_b(insn->dst_reg * 2 + 1);
3143 
3144 	if (imm & ~0U) {
3145 		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3146 		emit_alu(nfp_prog, imm_a(nfp_prog),
3147 			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3148 		or1 = imm_a(nfp_prog);
3149 	}
3150 
3151 	if (imm >> 32) {
3152 		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3153 		emit_alu(nfp_prog, imm_b(nfp_prog),
3154 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3155 		or2 = imm_b(nfp_prog);
3156 	}
3157 
3158 	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
3159 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3160 
3161 	return 0;
3162 }
3163 
3164 static int jeq32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3165 {
3166 	const struct bpf_insn *insn = &meta->insn;
3167 	swreg tmp_reg;
3168 
3169 	tmp_reg = ur_load_imm_any(nfp_prog, insn->imm, imm_b(nfp_prog));
3170 	emit_alu(nfp_prog, reg_none(),
3171 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3172 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3173 
3174 	return 0;
3175 }
3176 
3177 static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3178 {
3179 	const struct bpf_insn *insn = &meta->insn;
3180 	u64 imm = insn->imm; /* sign extend */
3181 	u8 dst_gpr = insn->dst_reg * 2;
3182 	swreg tmp_reg;
3183 
3184 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3185 	emit_alu(nfp_prog, imm_b(nfp_prog),
3186 		 reg_a(dst_gpr), ALU_OP_AND, tmp_reg);
3187 	/* Upper word of the mask can only be 0 or ~0 from sign extension,
3188 	 * so either ignore it or OR the whole thing in.
3189 	 */
3190 	if (is_mbpf_jmp64(meta) && imm >> 32) {
3191 		emit_alu(nfp_prog, reg_none(),
3192 			 reg_a(dst_gpr + 1), ALU_OP_OR, imm_b(nfp_prog));
3193 	}
3194 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3195 
3196 	return 0;
3197 }
3198 
3199 static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3200 {
3201 	const struct bpf_insn *insn = &meta->insn;
3202 	u64 imm = insn->imm; /* sign extend */
3203 	bool is_jmp32 = is_mbpf_jmp32(meta);
3204 	swreg tmp_reg;
3205 
3206 	if (!imm) {
3207 		if (is_jmp32)
3208 			emit_alu(nfp_prog, reg_none(), reg_none(), ALU_OP_NONE,
3209 				 reg_b(insn->dst_reg * 2));
3210 		else
3211 			emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
3212 				 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
3213 		emit_br(nfp_prog, BR_BNE, insn->off, 0);
3214 		return 0;
3215 	}
3216 
3217 	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
3218 	emit_alu(nfp_prog, reg_none(),
3219 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3220 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3221 
3222 	if (is_jmp32)
3223 		return 0;
3224 
3225 	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
3226 	emit_alu(nfp_prog, reg_none(),
3227 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
3228 	emit_br(nfp_prog, BR_BNE, insn->off, 0);
3229 
3230 	return 0;
3231 }
3232 
3233 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3234 {
3235 	const struct bpf_insn *insn = &meta->insn;
3236 
3237 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3238 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3239 	if (is_mbpf_jmp64(meta)) {
3240 		emit_alu(nfp_prog, imm_b(nfp_prog),
3241 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR,
3242 			 reg_b(insn->src_reg * 2 + 1));
3243 		emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR,
3244 			 imm_b(nfp_prog));
3245 	}
3246 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3247 
3248 	return 0;
3249 }
3250 
3251 static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3252 {
3253 	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
3254 }
3255 
3256 static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3257 {
3258 	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
3259 }
3260 
/* Emit the caller side of a BPF-to-BPF call: move the stack pointer past the
 * caller's frame (if it has one), load the return address into ret_reg() and
 * branch to the callee - directly or through the register pushing
 * subroutine.  The code restoring the stack pointer is emitted right at the
 * return target.
 *
 * meta->num_insns_after_br records how many instructions follow the branch
 * so that nfp_fixup_branches() can find the branch again.
 *
 * Returns 0 on success, -ELOOP if no call destination was recorded for the
 * instruction, or -EINVAL if the emitted code does not end at the return
 * target computed up front.
 */
static int
bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u32 ret_tgt, stack_depth, offset_br;
	swreg tmp_reg;

	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
	/* Space for saving the return address is accounted for by the callee,
	 * so stack_depth can be zero for the main function.
	 */
	if (stack_depth) {
		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
					  stack_imm(nfp_prog));
		emit_alu(nfp_prog, stack_reg(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
			    NFP_CSR_ACT_LM_ADDR0);
	}

	/* Two cases for jumping to the callee:
	 *
	 * - If callee uses and needs to save R6~R9 then:
	 *     1. Put the start offset of the callee into imm_b(). This will
	 *        require a fixup step, as we do not necessarily know this
	 *        address yet.
	 *     2. Put the return address from the callee to the caller into
	 *        register ret_reg().
	 *     3. (After defer slots are consumed) Jump to the subroutine that
	 *        pushes the registers to the stack.
	 *   The subroutine acts as a trampoline, and returns to the address in
	 *   imm_b(), i.e. jumps to the callee.
	 *
	 * - If callee does not need to save R6~R9 then just load return
	 *   address to the caller in ret_reg(), and jump to the callee
	 *   directly.
	 *
	 * Using ret_reg() to pass the return address to the callee is set here
	 * as a convention. The callee can then push this address onto its
	 * stack frame in its prologue. The advantages of passing the return
	 * address through ret_reg(), instead of pushing it to the stack right
	 * here, are the following:
	 * - It looks cleaner.
	 * - If the called function is called multiple times, we get a lower
	 *   program size.
	 * - We save two no-op instructions that should be added just before
	 *   the emit_br() when stack depth is not null otherwise.
	 * - If we ever find a register to hold the return address during whole
	 *   execution of the callee, we will not have to push the return
	 *   address to the stack for leaf functions.
	 */
	if (!meta->jmp_dst) {
		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
		return -ELOOP;
	}
	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
		/* Branch plus the two instructions emitted after it */
		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
			     RELO_BR_GO_CALL_PUSH_REGS);
		offset_br = nfp_prog_current_offset(nfp_prog);
		/* Placeholder, nfp_fixup_immed_relo() fills in the callee */
		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
	} else {
		/* Branch plus the single instruction emitted after it */
		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
		emit_br(nfp_prog, BR_UNC, meta->insn.imm, 1);
		offset_br = nfp_prog_current_offset(nfp_prog);
	}
	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
		return -EINVAL;

	/* Callee returns here - undo the stack pointer adjustment */
	if (stack_depth) {
		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
					  stack_imm(nfp_prog));
		emit_alu(nfp_prog, stack_reg(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
			    NFP_CSR_ACT_LM_ADDR0);
		wrp_nops(nfp_prog, 3);
	}

	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
	meta->num_insns_after_br -= offset_br;

	return 0;
}
3346 
3347 static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3348 {
3349 	switch (meta->insn.imm) {
3350 	case BPF_FUNC_xdp_adjust_head:
3351 		return adjust_head(nfp_prog, meta);
3352 	case BPF_FUNC_xdp_adjust_tail:
3353 		return adjust_tail(nfp_prog, meta);
3354 	case BPF_FUNC_map_lookup_elem:
3355 	case BPF_FUNC_map_update_elem:
3356 	case BPF_FUNC_map_delete_elem:
3357 		return map_call_stack_common(nfp_prog, meta);
3358 	case BPF_FUNC_get_prandom_u32:
3359 		return nfp_get_prandom_u32(nfp_prog, meta);
3360 	case BPF_FUNC_perf_event_output:
3361 		return nfp_perf_event_output(nfp_prog, meta);
3362 	default:
3363 		WARN_ONCE(1, "verifier allowed unsupported function\n");
3364 		return -EOPNOTSUPP;
3365 	}
3366 }
3367 
/* BPF_JMP | BPF_CALL: either a call into another BPF function (pseudo call)
 * or a call to a helper.
 */
static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (!is_mbpf_pseudo_call(meta))
		return helper_call(nfp_prog, meta);

	return bpf_to_bpf_call(nfp_prog, meta);
}
3375 
3376 static bool nfp_is_main_function(struct nfp_insn_meta *meta)
3377 {
3378 	return meta->subprog_idx == 0;
3379 }
3380 
/* Emit an unconditional branch carrying the RELO_BR_GO_OUT relocation, used
 * by jmp_exit() when the main function exits.  @meta is not used.
 * Always returns 0.
 */
static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);

	return 0;
}
3387 
3388 static int
3389 nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3390 {
3391 	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
3392 		/* Pop R6~R9 to the stack via related subroutine.
3393 		 * We loaded the return address to the caller into ret_reg().
3394 		 * This means that the subroutine does not come back here, we
3395 		 * make it jump back to the subprogram caller directly!
3396 		 */
3397 		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
3398 			     RELO_BR_GO_CALL_POP_REGS);
3399 		/* Pop return address from the stack. */
3400 		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3401 	} else {
3402 		/* Pop return address from the stack. */
3403 		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
3404 		/* Jump back to caller if no callee-saved registers were used
3405 		 * by the subprogram.
3406 		 */
3407 		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
3408 	}
3409 
3410 	return 0;
3411 }
3412 
/* BPF_JMP | BPF_EXIT: leaving a subprogram returns to its caller, leaving
 * the main function jumps to the program exit path.
 */
static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (!nfp_is_main_function(meta))
		return nfp_subprog_epilogue(nfp_prog, meta);

	return goto_out(nfp_prog, meta);
}
3420 
/* Translation callbacks indexed by the BPF opcode byte (insn.code).
 * nfp_translate() looks up the callback for each instruction here; opcodes
 * without an entry are NULL and make the translation fail with -ENOENT
 * unless the previous instruction supplies a double_cb.
 */
static const instr_cb_t instr_cb[256] = {
	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
	[BPF_ALU | BPF_NEG] =		neg_reg,
	[BPF_ALU | BPF_LSH | BPF_X] =	shl_reg,
	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
	[BPF_ALU | BPF_RSH | BPF_X] =	shr_reg,
	[BPF_ALU | BPF_RSH | BPF_K] =	shr_imm,
	[BPF_ALU | BPF_ARSH | BPF_X] =	ashr_reg,
	[BPF_ALU | BPF_ARSH | BPF_K] =	ashr_imm,
	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
	[BPF_STX | BPF_ATOMIC | BPF_W] =	mem_atomic4,
	[BPF_STX | BPF_ATOMIC | BPF_DW] =	mem_atomic8,
	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
	[BPF_JMP | BPF_JA | BPF_K] =	jump,
	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
	[BPF_JMP32 | BPF_JEQ | BPF_K] =	jeq32_imm,
	[BPF_JMP32 | BPF_JGT | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JGE | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JLT | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JLE | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JSGT | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSGE | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSLT | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSLE | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSET | BPF_K] =jset_imm,
	[BPF_JMP32 | BPF_JNE | BPF_K] =	jne_imm,
	[BPF_JMP32 | BPF_JEQ | BPF_X] =	jeq_reg,
	[BPF_JMP32 | BPF_JGT | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JGE | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JLT | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JLE | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JSGT | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSGE | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSLT | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSLE | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSET | BPF_X] =jset_reg,
	[BPF_JMP32 | BPF_JNE | BPF_X] =	jne_reg,
	[BPF_JMP | BPF_CALL] =		call,
	[BPF_JMP | BPF_EXIT] =		jmp_exit,
};
3538 
3539 /* --- Assembler logic --- */
3540 static int
3541 nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
3542 		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
3543 {
3544 	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
3545 		pr_err("BUG: failed to fix up callee register saving\n");
3546 		return -EINVAL;
3547 	}
3548 
3549 	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);
3550 
3551 	return 0;
3552 }
3553 
/* Resolve branch targets once the NFP offset of every BPF instruction is
 * known.
 *
 * For each translated jump (helper calls and subprogram exits excluded) the
 * last NFP instruction generated for it is checked to be a branch.  Branches
 * with a RELO_BR_REL relocation then get the offset of the jump destination
 * written into every branch instruction emitted for the BPF instruction.
 * BPF-to-BPF calls into subprograms which push registers additionally get
 * the callee offset patched into the immediate load after the branch.
 *
 * Returns 0 on success, -ELOOP on inconsistent translation state, or the
 * error from nfp_fixup_immed_relo().
 */
static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *meta, *jmp_dst;
	u32 idx, br_idx;
	int err;

	list_for_each_entry(meta, &nfp_prog->insns, l) {
		if (meta->flags & FLAG_INSN_SKIP_MASK)
			continue;
		if (!is_mbpf_jmp(meta))
			continue;
		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
		    !nfp_is_main_function(meta))
			continue;
		if (is_mbpf_helper_call(meta))
			continue;

		/* The branch is expected to be the last NFP instruction
		 * generated for this BPF instruction.
		 */
		if (list_is_last(&meta->l, &nfp_prog->insns))
			br_idx = nfp_prog->last_bpf_off;
		else
			br_idx = list_next_entry(meta, l)->off - 1;

		/* For BPF-to-BPF function call, a stack adjustment sequence is
		 * generated after the return instruction. Therefore, we must
		 * withdraw the length of this sequence to have br_idx pointing
		 * to where the "branch" NFP instruction is expected to be.
		 */
		if (is_mbpf_pseudo_call(meta))
			br_idx -= meta->num_insns_after_br;

		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
			return -ELOOP;
		}

		/* Exits of the main function need no target from here */
		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
			continue;

		/* Leave special branches for later */
		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
			continue;

		if (!meta->jmp_dst) {
			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
			return -ELOOP;
		}

		jmp_dst = meta->jmp_dst;

		if (jmp_dst->flags & FLAG_INSN_SKIP_PREC_DEPENDENT) {
			pr_err("Branch landing on removed instruction!!\n");
			return -ELOOP;
		}

		/* Calls going through the register push subroutine carry the
		 * callee address in an immediate load instead of the branch.
		 */
		if (is_mbpf_pseudo_call(meta) &&
		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
			err = nfp_fixup_immed_relo(nfp_prog, meta,
						   jmp_dst, br_idx);
			if (err)
				return err;
		}

		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
		    RELO_BR_REL)
			continue;

		/* Point every branch emitted for this BPF instruction at the
		 * destination.
		 */
		for (idx = meta->off; idx <= br_idx; idx++) {
			if (!nfp_is_br(nfp_prog->prog[idx]))
				continue;
			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
		}
	}

	return 0;
}
3631 
3632 static void nfp_intro(struct nfp_prog *nfp_prog)
3633 {
3634 	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
3635 	emit_alu(nfp_prog, plen_reg(nfp_prog),
3636 		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
3637 }
3638 
3639 static void
3640 nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3641 {
3642 	/* Save return address into the stack. */
3643 	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
3644 }
3645 
3646 static void
3647 nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3648 {
3649 	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;
3650 
3651 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3652 	nfp_subprog_prologue(nfp_prog, meta);
3653 }
3654 
3655 bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
3656 {
3657 	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
3658 }
3659 
/* Emit the program exit code for TC direct-action programs.  Records the
 * offsets of the abort path and of the normal exit path in
 * nfp_prog->tgt_abort and nfp_prog->tgt_out.
 */
static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
{
	/* TC direct-action mode:
	 *   0,1   ok        NOT SUPPORTED[1]
	 *   2   drop  0x22 -> drop,  count as stat1
	 *   4,5 nuke  0x02 -> drop
	 *   7  redir  0x44 -> redir, count as stat2
	 *   * unspec  0x11 -> pass,  count as stat0
	 *
	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
	 *     the exact decision made.  We are forced to support UNSPEC
	 *     to handle aborts so that's the only one we handle for passing
	 *     packets up the stack.
	 */
	/* Target for aborts */
	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);

	/* The two instructions below are emitted after the branch which
	 * declares two defer slots.
	 */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);

	/* Target for normal exits */
	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);

	/* if R0 > 7 jump to abort */
	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);

	/* Two tables with one nibble per return code 0..7; combined below
	 * they give the result codes listed in the table above.
	 */
	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
	wrp_immed(nfp_prog, reg_b(3), 0x41001211);

	/* R0 * 4 */
	emit_shf(nfp_prog, reg_a(1),
		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);

	/* NOTE(review): the result-less OR before each shift presumably
	 * provides R0 * 4 as the shift amount of the following right shift
	 * (shift count 0) - confirm against the assembler helpers.
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_a(2),
		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);

	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_b(2),
		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);

	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	/* Merge both nibbles into one byte and write it into the flags */
	emit_shf(nfp_prog, reg_b(2),
		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
}
3710 
/* Emit the program exit code for XDP programs.  Records the offsets of the
 * abort path and of the normal exit path in nfp_prog->tgt_abort and
 * nfp_prog->tgt_out.
 */
static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
{
	/* XDP return codes:
	 *   0 aborted  0x82 -> drop,  count as stat3
	 *   1    drop  0x22 -> drop,  count as stat1
	 *   2    pass  0x11 -> pass,  count as stat0
	 *   3      tx  0x44 -> redir, count as stat2
	 *   * unknown  0x82 -> drop,  count as stat3
	 */
	/* Target for aborts */
	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);

	/* The two instructions below are emitted after the branch which
	 * declares two defer slots.
	 */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);

	/* Target for normal exits */
	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);

	/* if R0 > 3 jump to abort */
	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);

	/* One byte per return code 0..3, lowest byte first, matching the
	 * table above.
	 */
	wrp_immed(nfp_prog, reg_b(2), 0x44112282);

	/* R0 * 8 */
	emit_shf(nfp_prog, reg_a(1),
		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);

	/* NOTE(review): the result-less OR presumably provides R0 * 8 as the
	 * shift amount of the following right shift (shift count 0) -
	 * confirm against the assembler helpers.
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_b(2),
		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);

	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
}
3749 
3750 static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
3751 {
3752 	unsigned int idx;
3753 
3754 	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
3755 		if (nfp_prog->subprog[idx].needs_reg_push)
3756 			return true;
3757 
3758 	return false;
3759 }
3760 
/* Emit the shared subroutine which stores R6~R9 to the stack frame and then
 * jumps to the address held in imm_b().  The start of the subroutine is
 * recorded in nfp_prog->tgt_call_push_regs.
 */
static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
{
	u8 reg;

	/* Subroutine: Save all callee saved registers (R6 ~ R9).
	 * imm_b() holds the return address.
	 */
	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
		/* adj: index of the low GPR of the BPF register,
		 * idx: slot offset of the register within the save area.
		 */
		u8 adj = (reg - BPF_REG_0) * 2;
		u8 idx = (reg - BPF_REG_6) * 2;

		/* The first slot in the stack frame is reserved for the
		 * return address (see bpf_to_bpf_call() and
		 * nfp_subprog_prologue()), start just after.
		 */
		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));

		/* Three moves remain after the low word of R8 (high word of
		 * R8 and both words of R9), they fill the defer slots of the
		 * return.
		 */
		if (reg == BPF_REG_8)
			/* Prepare to jump back, last 3 insns use defer slots */
			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);

		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
	}
}
3785 
/* Emit the shared subroutine which reloads R6~R9 from the stack frame and
 * then jumps to the address held in ret_reg().  The start of the subroutine
 * is recorded in nfp_prog->tgt_call_pop_regs.
 */
static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
{
	u8 reg;

	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
	 * ret_reg() holds the return address.
	 */
	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
		/* adj: index of the low GPR of the BPF register,
		 * idx: slot offset of the register within the save area.
		 */
		u8 adj = (reg - BPF_REG_0) * 2;
		u8 idx = (reg - BPF_REG_6) * 2;

		/* The first slot in the stack frame holds the return address,
		 * start popping just after that.
		 */
		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));

		/* Three moves remain after the low word of R8 (high word of
		 * R8 and both words of R9), they fill the defer slots of the
		 * return.
		 */
		if (reg == BPF_REG_8)
			/* Prepare to jump back, last 3 insns use defer slots */
			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);

		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
	}
}
3810 
3811 static void nfp_outro(struct nfp_prog *nfp_prog)
3812 {
3813 	switch (nfp_prog->type) {
3814 	case BPF_PROG_TYPE_SCHED_CLS:
3815 		nfp_outro_tc_da(nfp_prog);
3816 		break;
3817 	case BPF_PROG_TYPE_XDP:
3818 		nfp_outro_xdp(nfp_prog);
3819 		break;
3820 	default:
3821 		WARN_ON(1);
3822 	}
3823 
3824 	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
3825 		return;
3826 
3827 	nfp_push_callee_registers(nfp_prog);
3828 	nfp_pop_callee_registers(nfp_prog);
3829 }
3830 
3831 static int nfp_translate(struct nfp_prog *nfp_prog)
3832 {
3833 	struct nfp_insn_meta *meta;
3834 	unsigned int depth;
3835 	int err;
3836 
3837 	depth = nfp_prog->subprog[0].stack_depth;
3838 	nfp_prog->stack_frame_depth = round_up(depth, 4);
3839 
3840 	nfp_intro(nfp_prog);
3841 	if (nfp_prog->error)
3842 		return nfp_prog->error;
3843 
3844 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3845 		instr_cb_t cb = instr_cb[meta->insn.code];
3846 
3847 		meta->off = nfp_prog_current_offset(nfp_prog);
3848 
3849 		if (nfp_is_subprog_start(meta)) {
3850 			nfp_start_subprog(nfp_prog, meta);
3851 			if (nfp_prog->error)
3852 				return nfp_prog->error;
3853 		}
3854 
3855 		if (meta->flags & FLAG_INSN_SKIP_MASK) {
3856 			nfp_prog->n_translated++;
3857 			continue;
3858 		}
3859 
3860 		if (nfp_meta_has_prev(nfp_prog, meta) &&
3861 		    nfp_meta_prev(meta)->double_cb)
3862 			cb = nfp_meta_prev(meta)->double_cb;
3863 		if (!cb)
3864 			return -ENOENT;
3865 		err = cb(nfp_prog, meta);
3866 		if (err)
3867 			return err;
3868 		if (nfp_prog->error)
3869 			return nfp_prog->error;
3870 
3871 		nfp_prog->n_translated++;
3872 	}
3873 
3874 	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;
3875 
3876 	nfp_outro(nfp_prog);
3877 	if (nfp_prog->error)
3878 		return nfp_prog->error;
3879 
3880 	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
3881 	if (nfp_prog->error)
3882 		return nfp_prog->error;
3883 
3884 	return nfp_fixup_branches(nfp_prog);
3885 }
3886 
3887 /* --- Optimizations --- */
3888 static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
3889 {
3890 	struct nfp_insn_meta *meta;
3891 
3892 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3893 		struct bpf_insn insn = meta->insn;
3894 
3895 		/* Programs converted from cBPF start with register xoring */
3896 		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
3897 		    insn.src_reg == insn.dst_reg)
3898 			continue;
3899 
3900 		/* Programs start with R6 = R1 but we ignore the skb pointer */
3901 		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
3902 		    insn.src_reg == 1 && insn.dst_reg == 6)
3903 			meta->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3904 
3905 		/* Return as soon as something doesn't match */
3906 		if (!(meta->flags & FLAG_INSN_SKIP_MASK))
3907 			return;
3908 	}
3909 }
3910 
3911 /* abs(insn.imm) will fit better into unrestricted reg immediate -
3912  * convert add/sub of a negative number into a sub/add of a positive one.
3913  */
3914 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3915 {
3916 	struct nfp_insn_meta *meta;
3917 
3918 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3919 		struct bpf_insn insn = meta->insn;
3920 
3921 		if (meta->flags & FLAG_INSN_SKIP_MASK)
3922 			continue;
3923 
3924 		if (!is_mbpf_alu(meta) && !is_mbpf_jmp(meta))
3925 			continue;
3926 		if (BPF_SRC(insn.code) != BPF_K)
3927 			continue;
3928 		if (insn.imm >= 0)
3929 			continue;
3930 
3931 		if (is_mbpf_jmp(meta)) {
3932 			switch (BPF_OP(insn.code)) {
3933 			case BPF_JGE:
3934 			case BPF_JSGE:
3935 			case BPF_JLT:
3936 			case BPF_JSLT:
3937 				meta->jump_neg_op = true;
3938 				break;
3939 			default:
3940 				continue;
3941 			}
3942 		} else {
3943 			if (BPF_OP(insn.code) == BPF_ADD)
3944 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3945 			else if (BPF_OP(insn.code) == BPF_SUB)
3946 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3947 			else
3948 				continue;
3949 
3950 			meta->insn.code = insn.code | BPF_K;
3951 		}
3952 
3953 		meta->insn.imm = -insn.imm;
3954 	}
3955 }
3956 
3957 /* Remove masking after load since our load guarantees this is not needed */
3958 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3959 {
3960 	struct nfp_insn_meta *meta1, *meta2;
3961 	static const s32 exp_mask[] = {
3962 		[BPF_B] = 0x000000ffU,
3963 		[BPF_H] = 0x0000ffffU,
3964 		[BPF_W] = 0xffffffffU,
3965 	};
3966 
3967 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3968 		struct bpf_insn insn, next;
3969 
3970 		insn = meta1->insn;
3971 		next = meta2->insn;
3972 
3973 		if (BPF_CLASS(insn.code) != BPF_LD)
3974 			continue;
3975 		if (BPF_MODE(insn.code) != BPF_ABS &&
3976 		    BPF_MODE(insn.code) != BPF_IND)
3977 			continue;
3978 
3979 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3980 			continue;
3981 
3982 		if (!exp_mask[BPF_SIZE(insn.code)])
3983 			continue;
3984 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3985 			continue;
3986 
3987 		if (next.src_reg || next.dst_reg)
3988 			continue;
3989 
3990 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3991 			continue;
3992 
3993 		meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3994 	}
3995 }
3996 
3997 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3998 {
3999 	struct nfp_insn_meta *meta1, *meta2, *meta3;
4000 
4001 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
4002 		struct bpf_insn insn, next1, next2;
4003 
4004 		insn = meta1->insn;
4005 		next1 = meta2->insn;
4006 		next2 = meta3->insn;
4007 
4008 		if (BPF_CLASS(insn.code) != BPF_LD)
4009 			continue;
4010 		if (BPF_MODE(insn.code) != BPF_ABS &&
4011 		    BPF_MODE(insn.code) != BPF_IND)
4012 			continue;
4013 		if (BPF_SIZE(insn.code) != BPF_W)
4014 			continue;
4015 
4016 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
4017 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
4018 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
4019 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
4020 			continue;
4021 
4022 		if (next1.src_reg || next1.dst_reg ||
4023 		    next2.src_reg || next2.dst_reg)
4024 			continue;
4025 
4026 		if (next1.imm != 0x20 || next2.imm != 0x20)
4027 			continue;
4028 
4029 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
4030 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
4031 			continue;
4032 
4033 		meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4034 		meta3->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4035 	}
4036 }
4037 
4038 /* load/store pair that forms memory copy sould look like the following:
4039  *
4040  *   ld_width R, [addr_src + offset_src]
4041  *   st_width [addr_dest + offset_dest], R
4042  *
4043  * The destination register of load and source register of store should
4044  * be the same, load and store should also perform at the same width.
4045  * If either of addr_src or addr_dest is stack pointer, we don't do the
4046  * CPP optimization as stack is modelled by registers on NFP.
4047  */
4048 static bool
4049 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
4050 		    struct nfp_insn_meta *st_meta)
4051 {
4052 	struct bpf_insn *ld = &ld_meta->insn;
4053 	struct bpf_insn *st = &st_meta->insn;
4054 
4055 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
4056 		return false;
4057 
4058 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
4059 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
4060 		return false;
4061 
4062 	if (st_meta->ptr.type != PTR_TO_PACKET)
4063 		return false;
4064 
4065 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
4066 		return false;
4067 
4068 	if (ld->dst_reg != st->src_reg)
4069 		return false;
4070 
4071 	/* There is jump to the store insn in this pair. */
4072 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
4073 		return false;
4074 
4075 	return true;
4076 }
4077 
4078 /* Currently, we only support chaining load/store pairs if:
4079  *
4080  *  - Their address base registers are the same.
4081  *  - Their address offsets are in the same order.
4082  *  - They operate at the same memory width.
4083  *  - There is no jump into the middle of them.
4084  */
4085 static bool
4086 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
4087 			      struct nfp_insn_meta *st_meta,
4088 			      struct bpf_insn *prev_ld,
4089 			      struct bpf_insn *prev_st)
4090 {
4091 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
4092 	struct bpf_insn *ld = &ld_meta->insn;
4093 	struct bpf_insn *st = &st_meta->insn;
4094 	s16 prev_ld_off, prev_st_off;
4095 
4096 	/* This pair is the start pair. */
4097 	if (!prev_ld)
4098 		return true;
4099 
4100 	prev_size = BPF_LDST_BYTES(prev_ld);
4101 	curr_size = BPF_LDST_BYTES(ld);
4102 	prev_ld_base = prev_ld->src_reg;
4103 	prev_st_base = prev_st->dst_reg;
4104 	prev_ld_dst = prev_ld->dst_reg;
4105 	prev_ld_off = prev_ld->off;
4106 	prev_st_off = prev_st->off;
4107 
4108 	if (ld->dst_reg != prev_ld_dst)
4109 		return false;
4110 
4111 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
4112 		return false;
4113 
4114 	if (curr_size != prev_size)
4115 		return false;
4116 
4117 	/* There is jump to the head of this pair. */
4118 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
4119 		return false;
4120 
4121 	/* Both in ascending order. */
4122 	if (prev_ld_off + prev_size == ld->off &&
4123 	    prev_st_off + prev_size == st->off)
4124 		return true;
4125 
4126 	/* Both in descending order. */
4127 	if (ld->off + curr_size == prev_ld_off &&
4128 	    st->off + curr_size == prev_st_off)
4129 		return true;
4130 
4131 	return false;
4132 }
4133 
4134 /* Return TRUE if cross memory access happens. Cross memory access means
4135  * store area is overlapping with load area that a later load might load
4136  * the value from previous store, for this case we can't treat the sequence
4137  * as an memory copy.
4138  */
4139 static bool
4140 cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
4141 		 struct nfp_insn_meta *head_st_meta)
4142 {
4143 	s16 head_ld_off, head_st_off, ld_off;
4144 
4145 	/* Different pointer types does not overlap. */
4146 	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
4147 		return false;
4148 
4149 	/* load and store are both PTR_TO_PACKET, check ID info.  */
4150 	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
4151 		return true;
4152 
4153 	/* Canonicalize the offsets. Turn all of them against the original
4154 	 * base register.
4155 	 */
4156 	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.off;
4157 	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.off;
4158 	ld_off = ld->off + head_ld_meta->ptr.off;
4159 
4160 	/* Ascending order cross. */
4161 	if (ld_off > head_ld_off &&
4162 	    head_ld_off < head_st_off && ld_off >= head_st_off)
4163 		return true;
4164 
4165 	/* Descending order cross. */
4166 	if (ld_off < head_ld_off &&
4167 	    head_ld_off > head_st_off && ld_off <= head_st_off)
4168 		return true;
4169 
4170 	return false;
4171 }
4172 
4173 /* This pass try to identify the following instructoin sequences.
4174  *
4175  *   load R, [regA + offA]
4176  *   store [regB + offB], R
4177  *   load R, [regA + offA + const_imm_A]
4178  *   store [regB + offB + const_imm_A], R
4179  *   load R, [regA + offA + 2 * const_imm_A]
4180  *   store [regB + offB + 2 * const_imm_A], R
4181  *   ...
4182  *
4183  * Above sequence is typically generated by compiler when lowering
4184  * memcpy. NFP prefer using CPP instructions to accelerate it.
4185  */
4186 static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
4187 {
4188 	struct nfp_insn_meta *head_ld_meta = NULL;
4189 	struct nfp_insn_meta *head_st_meta = NULL;
4190 	struct nfp_insn_meta *meta1, *meta2;
4191 	struct bpf_insn *prev_ld = NULL;
4192 	struct bpf_insn *prev_st = NULL;
4193 	u8 count = 0;
4194 
4195 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4196 		struct bpf_insn *ld = &meta1->insn;
4197 		struct bpf_insn *st = &meta2->insn;
4198 
4199 		/* Reset record status if any of the following if true:
4200 		 *   - The current insn pair is not load/store.
4201 		 *   - The load/store pair doesn't chain with previous one.
4202 		 *   - The chained load/store pair crossed with previous pair.
4203 		 *   - The chained load/store pair has a total size of memory
4204 		 *     copy beyond 128 bytes which is the maximum length a
4205 		 *     single NFP CPP command can transfer.
4206 		 */
4207 		if (!curr_pair_is_memcpy(meta1, meta2) ||
4208 		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
4209 						   prev_st) ||
4210 		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
4211 						       head_st_meta) ||
4212 				      head_ld_meta->ldst_gather_len >= 128))) {
4213 			if (!count)
4214 				continue;
4215 
4216 			if (count > 1) {
4217 				s16 prev_ld_off = prev_ld->off;
4218 				s16 prev_st_off = prev_st->off;
4219 				s16 head_ld_off = head_ld_meta->insn.off;
4220 
4221 				if (prev_ld_off < head_ld_off) {
4222 					head_ld_meta->insn.off = prev_ld_off;
4223 					head_st_meta->insn.off = prev_st_off;
4224 					head_ld_meta->ldst_gather_len =
4225 						-head_ld_meta->ldst_gather_len;
4226 				}
4227 
4228 				head_ld_meta->paired_st = &head_st_meta->insn;
4229 				head_st_meta->flags |=
4230 					FLAG_INSN_SKIP_PREC_DEPENDENT;
4231 			} else {
4232 				head_ld_meta->ldst_gather_len = 0;
4233 			}
4234 
4235 			/* If the chain is ended by an load/store pair then this
4236 			 * could serve as the new head of the next chain.
4237 			 */
4238 			if (curr_pair_is_memcpy(meta1, meta2)) {
4239 				head_ld_meta = meta1;
4240 				head_st_meta = meta2;
4241 				head_ld_meta->ldst_gather_len =
4242 					BPF_LDST_BYTES(ld);
4243 				meta1 = nfp_meta_next(meta1);
4244 				meta2 = nfp_meta_next(meta2);
4245 				prev_ld = ld;
4246 				prev_st = st;
4247 				count = 1;
4248 			} else {
4249 				head_ld_meta = NULL;
4250 				head_st_meta = NULL;
4251 				prev_ld = NULL;
4252 				prev_st = NULL;
4253 				count = 0;
4254 			}
4255 
4256 			continue;
4257 		}
4258 
4259 		if (!head_ld_meta) {
4260 			head_ld_meta = meta1;
4261 			head_st_meta = meta2;
4262 		} else {
4263 			meta1->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4264 			meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4265 		}
4266 
4267 		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
4268 		meta1 = nfp_meta_next(meta1);
4269 		meta2 = nfp_meta_next(meta2);
4270 		prev_ld = ld;
4271 		prev_st = st;
4272 		count++;
4273 	}
4274 }
4275 
4276 static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
4277 {
4278 	struct nfp_insn_meta *meta, *range_node = NULL;
4279 	s16 range_start = 0, range_end = 0;
4280 	bool cache_avail = false;
4281 	struct bpf_insn *insn;
4282 	s32 range_ptr_off = 0;
4283 	u32 range_ptr_id = 0;
4284 
4285 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4286 		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
4287 			cache_avail = false;
4288 
4289 		if (meta->flags & FLAG_INSN_SKIP_MASK)
4290 			continue;
4291 
4292 		insn = &meta->insn;
4293 
4294 		if (is_mbpf_store_pkt(meta) ||
4295 		    insn->code == (BPF_JMP | BPF_CALL) ||
4296 		    is_mbpf_classic_store_pkt(meta) ||
4297 		    is_mbpf_classic_load(meta)) {
4298 			cache_avail = false;
4299 			continue;
4300 		}
4301 
4302 		if (!is_mbpf_load(meta))
4303 			continue;
4304 
4305 		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
4306 			cache_avail = false;
4307 			continue;
4308 		}
4309 
4310 		if (!cache_avail) {
4311 			cache_avail = true;
4312 			if (range_node)
4313 				goto end_current_then_start_new;
4314 			goto start_new;
4315 		}
4316 
4317 		/* Check ID to make sure two reads share the same
4318 		 * variable offset against PTR_TO_PACKET, and check OFF
4319 		 * to make sure they also share the same constant
4320 		 * offset.
4321 		 *
4322 		 * OFFs don't really need to be the same, because they
4323 		 * are the constant offsets against PTR_TO_PACKET, so
4324 		 * for different OFFs, we could canonicalize them to
4325 		 * offsets against original packet pointer. We don't
4326 		 * support this.
4327 		 */
4328 		if (meta->ptr.id == range_ptr_id &&
4329 		    meta->ptr.off == range_ptr_off) {
4330 			s16 new_start = range_start;
4331 			s16 end, off = insn->off;
4332 			s16 new_end = range_end;
4333 			bool changed = false;
4334 
4335 			if (off < range_start) {
4336 				new_start = off;
4337 				changed = true;
4338 			}
4339 
4340 			end = off + BPF_LDST_BYTES(insn);
4341 			if (end > range_end) {
4342 				new_end = end;
4343 				changed = true;
4344 			}
4345 
4346 			if (!changed)
4347 				continue;
4348 
4349 			if (new_end - new_start <= 64) {
4350 				/* Install new range. */
4351 				range_start = new_start;
4352 				range_end = new_end;
4353 				continue;
4354 			}
4355 		}
4356 
4357 end_current_then_start_new:
4358 		range_node->pkt_cache.range_start = range_start;
4359 		range_node->pkt_cache.range_end = range_end;
4360 start_new:
4361 		range_node = meta;
4362 		range_node->pkt_cache.do_init = true;
4363 		range_ptr_id = range_node->ptr.id;
4364 		range_ptr_off = range_node->ptr.off;
4365 		range_start = insn->off;
4366 		range_end = insn->off + BPF_LDST_BYTES(insn);
4367 	}
4368 
4369 	if (range_node) {
4370 		range_node->pkt_cache.range_start = range_start;
4371 		range_node->pkt_cache.range_end = range_end;
4372 	}
4373 
4374 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4375 		if (meta->flags & FLAG_INSN_SKIP_MASK)
4376 			continue;
4377 
4378 		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
4379 			if (meta->pkt_cache.do_init) {
4380 				range_start = meta->pkt_cache.range_start;
4381 				range_end = meta->pkt_cache.range_end;
4382 			} else {
4383 				meta->pkt_cache.range_start = range_start;
4384 				meta->pkt_cache.range_end = range_end;
4385 			}
4386 		}
4387 	}
4388 }
4389 
4390 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
4391 {
4392 	nfp_bpf_opt_reg_init(nfp_prog);
4393 
4394 	nfp_bpf_opt_neg_add_sub(nfp_prog);
4395 	nfp_bpf_opt_ld_mask(nfp_prog);
4396 	nfp_bpf_opt_ld_shift(nfp_prog);
4397 	nfp_bpf_opt_ldst_gather(nfp_prog);
4398 	nfp_bpf_opt_pkt_cache(nfp_prog);
4399 
4400 	return 0;
4401 }
4402 
4403 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
4404 {
4405 	struct nfp_insn_meta *meta1, *meta2;
4406 	struct nfp_bpf_map *nfp_map;
4407 	struct bpf_map *map;
4408 	u32 id;
4409 
4410 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4411 		if (meta1->flags & FLAG_INSN_SKIP_MASK ||
4412 		    meta2->flags & FLAG_INSN_SKIP_MASK)
4413 			continue;
4414 
4415 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
4416 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
4417 			continue;
4418 
4419 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
4420 					      (u64)meta2->insn.imm << 32);
4421 		if (bpf_map_offload_neutral(map)) {
4422 			id = map->id;
4423 		} else {
4424 			nfp_map = map_to_offmap(map)->dev_priv;
4425 			id = nfp_map->tid;
4426 		}
4427 
4428 		meta1->insn.imm = id;
4429 		meta2->insn.imm = 0;
4430 	}
4431 
4432 	return 0;
4433 }
4434 
4435 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
4436 {
4437 	__le64 *ustore = (__force __le64 *)prog;
4438 	int i;
4439 
4440 	for (i = 0; i < len; i++) {
4441 		int err;
4442 
4443 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
4444 		if (err)
4445 			return err;
4446 
4447 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
4448 	}
4449 
4450 	return 0;
4451 }
4452 
4453 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
4454 {
4455 	void *prog;
4456 
4457 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
4458 	if (!prog)
4459 		return;
4460 
4461 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
4462 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
4463 	kvfree(nfp_prog->prog);
4464 	nfp_prog->prog = prog;
4465 }
4466 
4467 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
4468 {
4469 	int ret;
4470 
4471 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
4472 	if (ret)
4473 		return ret;
4474 
4475 	ret = nfp_bpf_optimize(nfp_prog);
4476 	if (ret)
4477 		return ret;
4478 
4479 	ret = nfp_translate(nfp_prog);
4480 	if (ret) {
4481 		pr_err("Translation failed with error %d (translated: %u)\n",
4482 		       ret, nfp_prog->n_translated);
4483 		return -EINVAL;
4484 	}
4485 
4486 	nfp_bpf_prog_trim(nfp_prog);
4487 
4488 	return ret;
4489 }
4490 
4491 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog)
4492 {
4493 	struct nfp_insn_meta *meta;
4494 
4495 	/* Another pass to record jump information. */
4496 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4497 		struct nfp_insn_meta *dst_meta;
4498 		u64 code = meta->insn.code;
4499 		unsigned int dst_idx;
4500 		bool pseudo_call;
4501 
4502 		if (!is_mbpf_jmp(meta))
4503 			continue;
4504 		if (BPF_OP(code) == BPF_EXIT)
4505 			continue;
4506 		if (is_mbpf_helper_call(meta))
4507 			continue;
4508 
4509 		/* If opcode is BPF_CALL at this point, this can only be a
4510 		 * BPF-to-BPF call (a.k.a pseudo call).
4511 		 */
4512 		pseudo_call = BPF_OP(code) == BPF_CALL;
4513 
4514 		if (pseudo_call)
4515 			dst_idx = meta->n + 1 + meta->insn.imm;
4516 		else
4517 			dst_idx = meta->n + 1 + meta->insn.off;
4518 
4519 		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx);
4520 
4521 		if (pseudo_call)
4522 			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4523 
4524 		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4525 		meta->jmp_dst = dst_meta;
4526 	}
4527 }
4528 
4529 bool nfp_bpf_supported_opcode(u8 code)
4530 {
4531 	return !!instr_cb[code];
4532 }
4533 
4534 void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
4535 {
4536 	unsigned int i;
4537 	u64 *prog;
4538 	int err;
4539 
4540 	prog = kmemdup(nfp_prog->prog, nfp_prog->prog_len * sizeof(u64),
4541 		       GFP_KERNEL);
4542 	if (!prog)
4543 		return ERR_PTR(-ENOMEM);
4544 
4545 	for (i = 0; i < nfp_prog->prog_len; i++) {
4546 		enum nfp_relo_type special;
4547 		u32 val;
4548 		u16 off;
4549 
4550 		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
4551 		switch (special) {
4552 		case RELO_NONE:
4553 			continue;
4554 		case RELO_BR_REL:
4555 			br_add_offset(&prog[i], bv->start_off);
4556 			break;
4557 		case RELO_BR_GO_OUT:
4558 			br_set_offset(&prog[i],
4559 				      nfp_prog->tgt_out + bv->start_off);
4560 			break;
4561 		case RELO_BR_GO_ABORT:
4562 			br_set_offset(&prog[i],
4563 				      nfp_prog->tgt_abort + bv->start_off);
4564 			break;
4565 		case RELO_BR_GO_CALL_PUSH_REGS:
4566 			if (!nfp_prog->tgt_call_push_regs) {
4567 				pr_err("BUG: failed to detect subprogram registers needs\n");
4568 				err = -EINVAL;
4569 				goto err_free_prog;
4570 			}
4571 			off = nfp_prog->tgt_call_push_regs + bv->start_off;
4572 			br_set_offset(&prog[i], off);
4573 			break;
4574 		case RELO_BR_GO_CALL_POP_REGS:
4575 			if (!nfp_prog->tgt_call_pop_regs) {
4576 				pr_err("BUG: failed to detect subprogram registers needs\n");
4577 				err = -EINVAL;
4578 				goto err_free_prog;
4579 			}
4580 			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
4581 			br_set_offset(&prog[i], off);
4582 			break;
4583 		case RELO_BR_NEXT_PKT:
4584 			br_set_offset(&prog[i], bv->tgt_done);
4585 			break;
4586 		case RELO_BR_HELPER:
4587 			val = br_get_offset(prog[i]);
4588 			val -= BR_OFF_RELO;
4589 			switch (val) {
4590 			case BPF_FUNC_map_lookup_elem:
4591 				val = nfp_prog->bpf->helpers.map_lookup;
4592 				break;
4593 			case BPF_FUNC_map_update_elem:
4594 				val = nfp_prog->bpf->helpers.map_update;
4595 				break;
4596 			case BPF_FUNC_map_delete_elem:
4597 				val = nfp_prog->bpf->helpers.map_delete;
4598 				break;
4599 			case BPF_FUNC_perf_event_output:
4600 				val = nfp_prog->bpf->helpers.perf_event_output;
4601 				break;
4602 			default:
4603 				pr_err("relocation of unknown helper %d\n",
4604 				       val);
4605 				err = -EINVAL;
4606 				goto err_free_prog;
4607 			}
4608 			br_set_offset(&prog[i], val);
4609 			break;
4610 		case RELO_IMMED_REL:
4611 			immed_add_value(&prog[i], bv->start_off);
4612 			break;
4613 		}
4614 
4615 		prog[i] &= ~OP_RELO_TYPE;
4616 	}
4617 
4618 	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
4619 	if (err)
4620 		goto err_free_prog;
4621 
4622 	return prog;
4623 
4624 err_free_prog:
4625 	kfree(prog);
4626 	return ERR_PTR(err);
4627 }
4628