1/**
2 * Author......: See docs/credits.txt
3 * License.....: MIT
4 */
5
6#include "inc_vendor.h"
7#include "inc_types.h"
8#include "inc_platform.h"
9#include "inc_common.h"
10#include "inc_rp_optimized.h"
11
// Fallback: if none of the headers included above defined MAYBE_UNUSED,
// define it as an empty macro so that uses of it expand to nothing.
#ifndef MAYBE_UNUSED
#define MAYBE_UNUSED
#endif
15
16DECLSPEC u32 generate_cmask_optimized (const u32 value)
17{
18  const u32 rmask =  ((value & 0x40404040u) >> 1u)
19                  & ~((value & 0x80808080u) >> 2u);
20
21  const u32 hmask = (value & 0x1f1f1f1fu) + 0x05050505u;
22  const u32 lmask = (value & 0x1f1f1f1fu) + 0x1f1f1f1fu;
23
24  return rmask & ~hmask & lmask;
25}
26
27DECLSPEC void truncate_right_optimized (u32 *buf0, u32 *buf1, const u32 offset)
28{
29  const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u;
30
31  const int offset_switch = offset / 4;
32
33  switch (offset_switch)
34  {
35    case  0:  buf0[0] &= tmp;
36              buf0[1]  = 0;
37              buf0[2]  = 0;
38              buf0[3]  = 0;
39              buf1[0]  = 0;
40              buf1[1]  = 0;
41              buf1[2]  = 0;
42              buf1[3]  = 0;
43              break;
44    case  1:  buf0[1] &= tmp;
45              buf0[2]  = 0;
46              buf0[3]  = 0;
47              buf1[0]  = 0;
48              buf1[1]  = 0;
49              buf1[2]  = 0;
50              buf1[3]  = 0;
51              break;
52    case  2:  buf0[2] &= tmp;
53              buf0[3]  = 0;
54              buf1[0]  = 0;
55              buf1[1]  = 0;
56              buf1[2]  = 0;
57              buf1[3]  = 0;
58              break;
59    case  3:  buf0[3] &= tmp;
60              buf1[0]  = 0;
61              buf1[1]  = 0;
62              buf1[2]  = 0;
63              buf1[3]  = 0;
64              break;
65    case  4:  buf1[0] &= tmp;
66              buf1[1]  = 0;
67              buf1[2]  = 0;
68              buf1[3]  = 0;
69              break;
70    case  5:  buf1[1] &= tmp;
71              buf1[2]  = 0;
72              buf1[3]  = 0;
73              break;
74    case  6:  buf1[2] &= tmp;
75              buf1[3]  = 0;
76              break;
77    case  7:  buf1[3] &= tmp;
78              break;
79  }
80}
81
82DECLSPEC void truncate_left_optimized (u32 *buf0, u32 *buf1, const u32 offset)
83{
84  const u32 tmp = ~((1u << ((offset & 3u) * 8u)) - 1u);
85
86  const int offset_switch = offset / 4;
87
88  switch (offset_switch)
89  {
90    case  0:  buf0[0] &= tmp;
91              break;
92    case  1:  buf0[0]  = 0;
93              buf0[1] &= tmp;
94              break;
95    case  2:  buf0[0]  = 0;
96              buf0[1]  = 0;
97              buf0[2] &= tmp;
98              break;
99    case  3:  buf0[0]  = 0;
100              buf0[1]  = 0;
101              buf0[2]  = 0;
102              buf0[3] &= tmp;
103              break;
104    case  4:  buf0[0]  = 0;
105              buf0[1]  = 0;
106              buf0[2]  = 0;
107              buf0[3]  = 0;
108              buf1[0] &= tmp;
109              break;
110    case  5:  buf0[0]  = 0;
111              buf0[1]  = 0;
112              buf0[2]  = 0;
113              buf0[3]  = 0;
114              buf1[0]  = 0;
115              buf1[1] &= tmp;
116              break;
117    case  6:  buf0[0]  = 0;
118              buf0[1]  = 0;
119              buf0[2]  = 0;
120              buf0[3]  = 0;
121              buf1[0]  = 0;
122              buf1[1]  = 0;
123              buf1[2] &= tmp;
124              break;
125    case  7:  buf0[0]  = 0;
126              buf0[1]  = 0;
127              buf0[2]  = 0;
128              buf0[3]  = 0;
129              buf1[0]  = 0;
130              buf1[1]  = 0;
131              buf1[2]  = 0;
132              buf1[3] &= tmp;
133              break;
134  }
135}
136
137DECLSPEC void lshift_block_optimized (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1)
138{
139  out0[0] = hc_bytealign_S (in0[0], in0[1], 3);
140  out0[1] = hc_bytealign_S (in0[1], in0[2], 3);
141  out0[2] = hc_bytealign_S (in0[2], in0[3], 3);
142  out0[3] = hc_bytealign_S (in0[3], in1[0], 3);
143  out1[0] = hc_bytealign_S (in1[0], in1[1], 3);
144  out1[1] = hc_bytealign_S (in1[1], in1[2], 3);
145  out1[2] = hc_bytealign_S (in1[2], in1[3], 3);
146  out1[3] = hc_bytealign_S (in1[3],      0, 3);
147}
148
149DECLSPEC void rshift_block_optimized (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1)
150{
151  out1[3] = hc_bytealign_S (in1[2], in1[3], 1);
152  out1[2] = hc_bytealign_S (in1[1], in1[2], 1);
153  out1[1] = hc_bytealign_S (in1[0], in1[1], 1);
154  out1[0] = hc_bytealign_S (in0[3], in1[0], 1);
155  out0[3] = hc_bytealign_S (in0[2], in0[3], 1);
156  out0[2] = hc_bytealign_S (in0[1], in0[2], 1);
157  out0[1] = hc_bytealign_S (in0[0], in0[1], 1);
158  out0[0] = hc_bytealign_S (     0, in0[0], 1);
159}
160
161DECLSPEC void lshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const u32 num)
162{
163  switch (num)
164  {
165    case  0:  out0[0] = in0[0];
166              out0[1] = in0[1];
167              out0[2] = in0[2];
168              out0[3] = in0[3];
169              out1[0] = in1[0];
170              out1[1] = in1[1];
171              out1[2] = in1[2];
172              out1[3] = in1[3];
173              break;
174    case  1:  out0[0] = hc_bytealign_S (in0[0], in0[1], 3);
175              out0[1] = hc_bytealign_S (in0[1], in0[2], 3);
176              out0[2] = hc_bytealign_S (in0[2], in0[3], 3);
177              out0[3] = hc_bytealign_S (in0[3], in1[0], 3);
178              out1[0] = hc_bytealign_S (in1[0], in1[1], 3);
179              out1[1] = hc_bytealign_S (in1[1], in1[2], 3);
180              out1[2] = hc_bytealign_S (in1[2], in1[3], 3);
181              out1[3] = hc_bytealign_S (in1[3],      0, 3);
182              break;
183    case  2:  out0[0] = hc_bytealign_S (in0[0], in0[1], 2);
184              out0[1] = hc_bytealign_S (in0[1], in0[2], 2);
185              out0[2] = hc_bytealign_S (in0[2], in0[3], 2);
186              out0[3] = hc_bytealign_S (in0[3], in1[0], 2);
187              out1[0] = hc_bytealign_S (in1[0], in1[1], 2);
188              out1[1] = hc_bytealign_S (in1[1], in1[2], 2);
189              out1[2] = hc_bytealign_S (in1[2], in1[3], 2);
190              out1[3] = hc_bytealign_S (in1[3],      0, 2);
191              break;
192    case  3:  out0[0] = hc_bytealign_S (in0[0], in0[1], 1);
193              out0[1] = hc_bytealign_S (in0[1], in0[2], 1);
194              out0[2] = hc_bytealign_S (in0[2], in0[3], 1);
195              out0[3] = hc_bytealign_S (in0[3], in1[0], 1);
196              out1[0] = hc_bytealign_S (in1[0], in1[1], 1);
197              out1[1] = hc_bytealign_S (in1[1], in1[2], 1);
198              out1[2] = hc_bytealign_S (in1[2], in1[3], 1);
199              out1[3] = hc_bytealign_S (in1[3],      0, 1);
200              break;
201    case  4:  out0[0] = in0[1];
202              out0[1] = in0[2];
203              out0[2] = in0[3];
204              out0[3] = in1[0];
205              out1[0] = in1[1];
206              out1[1] = in1[2];
207              out1[2] = in1[3];
208              out1[3] = 0;
209              break;
210    case  5:  out0[0] = hc_bytealign_S (in0[1], in0[2], 3);
211              out0[1] = hc_bytealign_S (in0[2], in0[3], 3);
212              out0[2] = hc_bytealign_S (in0[3], in1[0], 3);
213              out0[3] = hc_bytealign_S (in1[0], in1[1], 3);
214              out1[0] = hc_bytealign_S (in1[1], in1[2], 3);
215              out1[1] = hc_bytealign_S (in1[2], in1[3], 3);
216              out1[2] = hc_bytealign_S (in1[3],      0, 3);
217              out1[3] = 0;
218              break;
219    case  6:  out0[0] = hc_bytealign_S (in0[1], in0[2], 2);
220              out0[1] = hc_bytealign_S (in0[2], in0[3], 2);
221              out0[2] = hc_bytealign_S (in0[3], in1[0], 2);
222              out0[3] = hc_bytealign_S (in1[0], in1[1], 2);
223              out1[0] = hc_bytealign_S (in1[1], in1[2], 2);
224              out1[1] = hc_bytealign_S (in1[2], in1[3], 2);
225              out1[2] = hc_bytealign_S (in1[3],      0, 2);
226              out1[3] = 0;
227              break;
228    case  7:  out0[0] = hc_bytealign_S (in0[1], in0[2], 1);
229              out0[1] = hc_bytealign_S (in0[2], in0[3], 1);
230              out0[2] = hc_bytealign_S (in0[3], in1[0], 1);
231              out0[3] = hc_bytealign_S (in1[0], in1[1], 1);
232              out1[0] = hc_bytealign_S (in1[1], in1[2], 1);
233              out1[1] = hc_bytealign_S (in1[2], in1[3], 1);
234              out1[2] = hc_bytealign_S (in1[3],      0, 1);
235              out1[3] = 0;
236              break;
237    case  8:  out0[0] = in0[2];
238              out0[1] = in0[3];
239              out0[2] = in1[0];
240              out0[3] = in1[1];
241              out1[0] = in1[2];
242              out1[1] = in1[3];
243              out1[2] = 0;
244              out1[3] = 0;
245              break;
246    case  9:  out0[0] = hc_bytealign_S (in0[2], in0[3], 3);
247              out0[1] = hc_bytealign_S (in0[3], in1[0], 3);
248              out0[2] = hc_bytealign_S (in1[0], in1[1], 3);
249              out0[3] = hc_bytealign_S (in1[1], in1[2], 3);
250              out1[0] = hc_bytealign_S (in1[2], in1[3], 3);
251              out1[1] = hc_bytealign_S (in1[3],      0, 3);
252              out1[2] = 0;
253              out1[3] = 0;
254              break;
255    case 10:  out0[0] = hc_bytealign_S (in0[2], in0[3], 2);
256              out0[1] = hc_bytealign_S (in0[3], in1[0], 2);
257              out0[2] = hc_bytealign_S (in1[0], in1[1], 2);
258              out0[3] = hc_bytealign_S (in1[1], in1[2], 2);
259              out1[0] = hc_bytealign_S (in1[2], in1[3], 2);
260              out1[1] = hc_bytealign_S (in1[3],      0, 2);
261              out1[2] = 0;
262              out1[3] = 0;
263              break;
264    case 11:  out0[0] = hc_bytealign_S (in0[2], in0[3], 1);
265              out0[1] = hc_bytealign_S (in0[3], in1[0], 1);
266              out0[2] = hc_bytealign_S (in1[0], in1[1], 1);
267              out0[3] = hc_bytealign_S (in1[1], in1[2], 1);
268              out1[0] = hc_bytealign_S (in1[2], in1[3], 1);
269              out1[1] = hc_bytealign_S (in1[3],      0, 1);
270              out1[2] = 0;
271              out1[3] = 0;
272              break;
273    case 12:  out0[0] = in0[3];
274              out0[1] = in1[0];
275              out0[2] = in1[1];
276              out0[3] = in1[2];
277              out1[0] = in1[3];
278              out1[1] = 0;
279              out1[2] = 0;
280              out1[3] = 0;
281              break;
282    case 13:  out0[0] = hc_bytealign_S (in0[3], in1[0], 3);
283              out0[1] = hc_bytealign_S (in1[0], in1[1], 3);
284              out0[2] = hc_bytealign_S (in1[1], in1[2], 3);
285              out0[3] = hc_bytealign_S (in1[2], in1[3], 3);
286              out1[0] = hc_bytealign_S (in1[3],      0, 3);
287              out1[1] = 0;
288              out1[2] = 0;
289              out1[3] = 0;
290              break;
291    case 14:  out0[0] = hc_bytealign_S (in0[3], in1[0], 2);
292              out0[1] = hc_bytealign_S (in1[0], in1[1], 2);
293              out0[2] = hc_bytealign_S (in1[1], in1[2], 2);
294              out0[3] = hc_bytealign_S (in1[2], in1[3], 2);
295              out1[0] = hc_bytealign_S (in1[3],      0, 2);
296              out1[1] = 0;
297              out1[2] = 0;
298              out1[3] = 0;
299              break;
300    case 15:  out0[0] = hc_bytealign_S (in0[3], in1[0], 1);
301              out0[1] = hc_bytealign_S (in1[0], in1[1], 1);
302              out0[2] = hc_bytealign_S (in1[1], in1[2], 1);
303              out0[3] = hc_bytealign_S (in1[2], in1[3], 1);
304              out1[0] = hc_bytealign_S (in1[3],      0, 1);
305              out1[1] = 0;
306              out1[2] = 0;
307              out1[3] = 0;
308              break;
309    case 16:  out0[0] = in1[0];
310              out0[1] = in1[1];
311              out0[2] = in1[2];
312              out0[3] = in1[3];
313              out1[0] = 0;
314              out1[1] = 0;
315              out1[2] = 0;
316              out1[3] = 0;
317              break;
318    case 17:  out0[0] = hc_bytealign_S (in1[0], in1[1], 3);
319              out0[1] = hc_bytealign_S (in1[1], in1[2], 3);
320              out0[2] = hc_bytealign_S (in1[2], in1[3], 3);
321              out0[3] = hc_bytealign_S (in1[3],      0, 3);
322              out1[0] = 0;
323              out1[1] = 0;
324              out1[2] = 0;
325              out1[3] = 0;
326              break;
327    case 18:  out0[0] = hc_bytealign_S (in1[0], in1[1], 2);
328              out0[1] = hc_bytealign_S (in1[1], in1[2], 2);
329              out0[2] = hc_bytealign_S (in1[2], in1[3], 2);
330              out0[3] = hc_bytealign_S (in1[3],      0, 2);
331              out1[0] = 0;
332              out1[1] = 0;
333              out1[2] = 0;
334              out1[3] = 0;
335              break;
336    case 19:  out0[0] = hc_bytealign_S (in1[0], in1[1], 1);
337              out0[1] = hc_bytealign_S (in1[1], in1[2], 1);
338              out0[2] = hc_bytealign_S (in1[2], in1[3], 1);
339              out0[3] = hc_bytealign_S (in1[3],      0, 1);
340              out1[0] = 0;
341              out1[1] = 0;
342              out1[2] = 0;
343              out1[3] = 0;
344              break;
345    case 20:  out0[0] = in1[1];
346              out0[1] = in1[2];
347              out0[2] = in1[3];
348              out0[3] = 0;
349              out1[0] = 0;
350              out1[1] = 0;
351              out1[2] = 0;
352              out1[3] = 0;
353              break;
354    case 21:  out0[0] = hc_bytealign_S (in1[1], in1[2], 3);
355              out0[1] = hc_bytealign_S (in1[2], in1[3], 3);
356              out0[2] = hc_bytealign_S (in1[3],      0, 3);
357              out0[3] = 0;
358              out1[0] = 0;
359              out1[1] = 0;
360              out1[2] = 0;
361              out1[3] = 0;
362              break;
363    case 22:  out0[0] = hc_bytealign_S (in1[1], in1[2], 2);
364              out0[1] = hc_bytealign_S (in1[2], in1[3], 2);
365              out0[2] = hc_bytealign_S (in1[3],      0, 2);
366              out0[3] = 0;
367              out1[0] = 0;
368              out1[1] = 0;
369              out1[2] = 0;
370              out1[3] = 0;
371              break;
372    case 23:  out0[0] = hc_bytealign_S (in1[1], in1[2], 1);
373              out0[1] = hc_bytealign_S (in1[2], in1[3], 1);
374              out0[2] = hc_bytealign_S (in1[3],      0, 1);
375              out0[3] = 0;
376              out1[0] = 0;
377              out1[1] = 0;
378              out1[2] = 0;
379              out1[3] = 0;
380              break;
381    case 24:  out0[0] = in1[2];
382              out0[1] = in1[3];
383              out0[2] = 0;
384              out0[3] = 0;
385              out1[0] = 0;
386              out1[1] = 0;
387              out1[2] = 0;
388              out1[3] = 0;
389              break;
390    case 25:  out0[0] = hc_bytealign_S (in1[2], in1[3], 3);
391              out0[1] = hc_bytealign_S (in1[3],      0, 3);
392              out0[2] = 0;
393              out0[3] = 0;
394              out1[0] = 0;
395              out1[1] = 0;
396              out1[2] = 0;
397              out1[3] = 0;
398              break;
399    case 26:  out0[0] = hc_bytealign_S (in1[2], in1[3], 2);
400              out0[1] = hc_bytealign_S (in1[3],      0, 2);
401              out0[2] = 0;
402              out0[3] = 0;
403              out1[0] = 0;
404              out1[1] = 0;
405              out1[2] = 0;
406              out1[3] = 0;
407              break;
408    case 27:  out0[0] = hc_bytealign_S (in1[2], in1[3], 1);
409              out0[1] = hc_bytealign_S (in1[3],      0, 1);
410              out0[2] = 0;
411              out0[3] = 0;
412              out1[0] = 0;
413              out1[1] = 0;
414              out1[2] = 0;
415              out1[3] = 0;
416              break;
417    case 28:  out0[0] = in1[3];
418              out0[1] = 0;
419              out0[2] = 0;
420              out0[3] = 0;
421              out1[0] = 0;
422              out1[1] = 0;
423              out1[2] = 0;
424              out1[3] = 0;
425              break;
426    case 29:  out0[0] = hc_bytealign_S (in1[3],      0, 3);
427              out0[1] = 0;
428              out0[2] = 0;
429              out0[3] = 0;
430              out1[0] = 0;
431              out1[1] = 0;
432              out1[2] = 0;
433              out1[3] = 0;
434              break;
435    case 30:  out0[0] = hc_bytealign_S (in1[3],      0, 2);
436              out0[1] = 0;
437              out0[2] = 0;
438              out0[3] = 0;
439              out1[0] = 0;
440              out1[1] = 0;
441              out1[2] = 0;
442              out1[3] = 0;
443              break;
444    case 31:  out0[0] = hc_bytealign_S (in1[3],      0, 1);
445              out0[1] = 0;
446              out0[2] = 0;
447              out0[3] = 0;
448              out1[0] = 0;
449              out1[1] = 0;
450              out1[2] = 0;
451              out1[3] = 0;
452              break;
453  }
454}
455
456DECLSPEC void rshift_block_optimized_N (const u32 *in0, const u32 *in1, u32 *out0, u32 *out1, const u32 num)
457{
458  switch (num)
459  {
460    case  0:  out1[3] = in1[3];
461              out1[2] = in1[2];
462              out1[1] = in1[1];
463              out1[0] = in1[0];
464              out0[3] = in0[3];
465              out0[2] = in0[2];
466              out0[1] = in0[1];
467              out0[0] = in0[0];
468              break;
469    case  1:  out1[3] = hc_bytealign_S (in1[2], in1[3], 1);
470              out1[2] = hc_bytealign_S (in1[1], in1[2], 1);
471              out1[1] = hc_bytealign_S (in1[0], in1[1], 1);
472              out1[0] = hc_bytealign_S (in0[3], in1[0], 1);
473              out0[3] = hc_bytealign_S (in0[2], in0[3], 1);
474              out0[2] = hc_bytealign_S (in0[1], in0[2], 1);
475              out0[1] = hc_bytealign_S (in0[0], in0[1], 1);
476              out0[0] = hc_bytealign_S (     0, in0[0], 1);
477              break;
478    case  2:  out1[3] = hc_bytealign_S (in1[2], in1[3], 2);
479              out1[2] = hc_bytealign_S (in1[1], in1[2], 2);
480              out1[1] = hc_bytealign_S (in1[0], in1[1], 2);
481              out1[0] = hc_bytealign_S (in0[3], in1[0], 2);
482              out0[3] = hc_bytealign_S (in0[2], in0[3], 2);
483              out0[2] = hc_bytealign_S (in0[1], in0[2], 2);
484              out0[1] = hc_bytealign_S (in0[0], in0[1], 2);
485              out0[0] = hc_bytealign_S (     0, in0[0], 2);
486              break;
487    case  3:  out1[3] = hc_bytealign_S (in1[2], in1[3], 3);
488              out1[2] = hc_bytealign_S (in1[1], in1[2], 3);
489              out1[1] = hc_bytealign_S (in1[0], in1[1], 3);
490              out1[0] = hc_bytealign_S (in0[3], in1[0], 3);
491              out0[3] = hc_bytealign_S (in0[2], in0[3], 3);
492              out0[2] = hc_bytealign_S (in0[1], in0[2], 3);
493              out0[1] = hc_bytealign_S (in0[0], in0[1], 3);
494              out0[0] = hc_bytealign_S (     0, in0[0], 3);
495              break;
496    case  4:  out1[3] = in1[2];
497              out1[2] = in1[1];
498              out1[1] = in1[0];
499              out1[0] = in0[3];
500              out0[3] = in0[2];
501              out0[2] = in0[1];
502              out0[1] = in0[0];
503              out0[0] = 0;
504              break;
505    case  5:  out1[3] = hc_bytealign_S (in1[1], in1[2], 1);
506              out1[2] = hc_bytealign_S (in1[0], in1[1], 1);
507              out1[1] = hc_bytealign_S (in0[3], in1[0], 1);
508              out1[0] = hc_bytealign_S (in0[2], in0[3], 1);
509              out0[3] = hc_bytealign_S (in0[1], in0[2], 1);
510              out0[2] = hc_bytealign_S (in0[0], in0[1], 1);
511              out0[1] = hc_bytealign_S (     0, in0[0], 1);
512              out0[0] = 0;
513              break;
514    case  6:  out1[3] = hc_bytealign_S (in1[1], in1[2], 2);
515              out1[2] = hc_bytealign_S (in1[0], in1[1], 2);
516              out1[1] = hc_bytealign_S (in0[3], in1[0], 2);
517              out1[0] = hc_bytealign_S (in0[2], in0[3], 2);
518              out0[3] = hc_bytealign_S (in0[1], in0[2], 2);
519              out0[2] = hc_bytealign_S (in0[0], in0[1], 2);
520              out0[1] = hc_bytealign_S (     0, in0[0], 2);
521              out0[0] = 0;
522              break;
523    case  7:  out1[3] = hc_bytealign_S (in1[1], in1[2], 3);
524              out1[2] = hc_bytealign_S (in1[0], in1[1], 3);
525              out1[1] = hc_bytealign_S (in0[3], in1[0], 3);
526              out1[0] = hc_bytealign_S (in0[2], in0[3], 3);
527              out0[3] = hc_bytealign_S (in0[1], in0[2], 3);
528              out0[2] = hc_bytealign_S (in0[0], in0[1], 3);
529              out0[1] = hc_bytealign_S (     0, in0[0], 3);
530              out0[0] = 0;
531              break;
532    case  8:  out1[3] = in1[1];
533              out1[2] = in1[0];
534              out1[1] = in0[3];
535              out1[0] = in0[2];
536              out0[3] = in0[1];
537              out0[2] = in0[0];
538              out0[1] = 0;
539              out0[0] = 0;
540              break;
541    case  9:  out1[3] = hc_bytealign_S (in1[0], in1[1], 1);
542              out1[2] = hc_bytealign_S (in0[3], in1[0], 1);
543              out1[1] = hc_bytealign_S (in0[2], in0[3], 1);
544              out1[0] = hc_bytealign_S (in0[1], in0[2], 1);
545              out0[3] = hc_bytealign_S (in0[0], in0[1], 1);
546              out0[2] = hc_bytealign_S (     0, in0[0], 1);
547              out0[1] = 0;
548              out0[0] = 0;
549              break;
550    case 10:  out1[3] = hc_bytealign_S (in1[0], in1[1], 2);
551              out1[2] = hc_bytealign_S (in0[3], in1[0], 2);
552              out1[1] = hc_bytealign_S (in0[2], in0[3], 2);
553              out1[0] = hc_bytealign_S (in0[1], in0[2], 2);
554              out0[3] = hc_bytealign_S (in0[0], in0[1], 2);
555              out0[2] = hc_bytealign_S (     0, in0[0], 2);
556              out0[1] = 0;
557              out0[0] = 0;
558              break;
559    case 11:  out1[3] = hc_bytealign_S (in1[0], in1[1], 3);
560              out1[2] = hc_bytealign_S (in0[3], in1[0], 3);
561              out1[1] = hc_bytealign_S (in0[2], in0[3], 3);
562              out1[0] = hc_bytealign_S (in0[1], in0[2], 3);
563              out0[3] = hc_bytealign_S (in0[0], in0[1], 3);
564              out0[2] = hc_bytealign_S (     0, in0[0], 3);
565              out0[1] = 0;
566              out0[0] = 0;
567              break;
568    case 12:  out1[3] = in1[0];
569              out1[2] = in0[3];
570              out1[1] = in0[2];
571              out1[0] = in0[1];
572              out0[3] = in0[0];
573              out0[2] = 0;
574              out0[1] = 0;
575              out0[0] = 0;
576              break;
577    case 13:  out1[3] = hc_bytealign_S (in0[3], in1[0], 1);
578              out1[2] = hc_bytealign_S (in0[2], in0[3], 1);
579              out1[1] = hc_bytealign_S (in0[1], in0[2], 1);
580              out1[0] = hc_bytealign_S (in0[0], in0[1], 1);
581              out0[3] = hc_bytealign_S (     0, in0[0], 1);
582              out0[2] = 0;
583              out0[1] = 0;
584              out0[0] = 0;
585              break;
586    case 14:  out1[3] = hc_bytealign_S (in0[3], in1[0], 2);
587              out1[2] = hc_bytealign_S (in0[2], in0[3], 2);
588              out1[1] = hc_bytealign_S (in0[1], in0[2], 2);
589              out1[0] = hc_bytealign_S (in0[0], in0[1], 2);
590              out0[3] = hc_bytealign_S (     0, in0[0], 2);
591              out0[2] = 0;
592              out0[1] = 0;
593              out0[0] = 0;
594              break;
595    case 15:  out1[3] = hc_bytealign_S (in0[3], in1[0], 3);
596              out1[2] = hc_bytealign_S (in0[2], in0[3], 3);
597              out1[1] = hc_bytealign_S (in0[1], in0[2], 3);
598              out1[0] = hc_bytealign_S (in0[0], in0[1], 3);
599              out0[3] = hc_bytealign_S (     0, in0[0], 3);
600              out0[2] = 0;
601              out0[1] = 0;
602              out0[0] = 0;
603              break;
604    case 16:  out1[3] = in0[3];
605              out1[2] = in0[2];
606              out1[1] = in0[1];
607              out1[0] = in0[0];
608              out0[3] = 0;
609              out0[2] = 0;
610              out0[1] = 0;
611              out0[0] = 0;
612              break;
613    case 17:  out1[3] = hc_bytealign_S (in0[2], in0[3], 1);
614              out1[2] = hc_bytealign_S (in0[1], in0[2], 1);
615              out1[1] = hc_bytealign_S (in0[0], in0[1], 1);
616              out1[0] = hc_bytealign_S (     0, in0[0], 1);
617              out0[3] = 0;
618              out0[2] = 0;
619              out0[1] = 0;
620              out0[0] = 0;
621              break;
622    case 18:  out1[3] = hc_bytealign_S (in0[2], in0[3], 2);
623              out1[2] = hc_bytealign_S (in0[1], in0[2], 2);
624              out1[1] = hc_bytealign_S (in0[0], in0[1], 2);
625              out1[0] = hc_bytealign_S (     0, in0[0], 2);
626              out0[3] = 0;
627              out0[2] = 0;
628              out0[1] = 0;
629              out0[0] = 0;
630              break;
631    case 19:  out1[3] = hc_bytealign_S (in0[2], in0[3], 3);
632              out1[2] = hc_bytealign_S (in0[1], in0[2], 3);
633              out1[1] = hc_bytealign_S (in0[0], in0[1], 3);
634              out1[0] = hc_bytealign_S (     0, in0[0], 3);
635              out0[3] = 0;
636              out0[2] = 0;
637              out0[1] = 0;
638              out0[0] = 0;
639              break;
640    case 20:  out1[3] = in0[2];
641              out1[2] = in0[1];
642              out1[1] = in0[0];
643              out1[0] = 0;
644              out0[3] = 0;
645              out0[2] = 0;
646              out0[1] = 0;
647              out0[0] = 0;
648              break;
649    case 21:  out1[3] = hc_bytealign_S (in0[1], in0[2], 1);
650              out1[2] = hc_bytealign_S (in0[0], in0[1], 1);
651              out1[1] = hc_bytealign_S (     0, in0[0], 1);
652              out1[0] = 0;
653              out0[3] = 0;
654              out0[2] = 0;
655              out0[1] = 0;
656              out0[0] = 0;
657              break;
658    case 22:  out1[3] = hc_bytealign_S (in0[1], in0[2], 2);
659              out1[2] = hc_bytealign_S (in0[0], in0[1], 2);
660              out1[1] = hc_bytealign_S (     0, in0[0], 2);
661              out1[0] = 0;
662              out0[3] = 0;
663              out0[2] = 0;
664              out0[1] = 0;
665              out0[0] = 0;
666              break;
667    case 23:  out1[3] = hc_bytealign_S (in0[1], in0[2], 3);
668              out1[2] = hc_bytealign_S (in0[0], in0[1], 3);
669              out1[1] = hc_bytealign_S (     0, in0[0], 3);
670              out1[0] = 0;
671              out0[3] = 0;
672              out0[2] = 0;
673              out0[1] = 0;
674              out0[0] = 0;
675              break;
676    case 24:  out1[3] = in0[1];
677              out1[2] = in0[0];
678              out1[1] = 0;
679              out1[0] = 0;
680              out0[3] = 0;
681              out0[2] = 0;
682              out0[1] = 0;
683              out0[0] = 0;
684              break;
685    case 25:  out1[3] = hc_bytealign_S (in0[0], in0[1], 1);
686              out1[2] = hc_bytealign_S (     0, in0[0], 1);
687              out1[1] = 0;
688              out1[0] = 0;
689              out0[3] = 0;
690              out0[2] = 0;
691              out0[1] = 0;
692              out0[0] = 0;
693              break;
694    case 26:  out1[3] = hc_bytealign_S (in0[0], in0[1], 2);
695              out1[2] = hc_bytealign_S (     0, in0[0], 2);
696              out1[1] = 0;
697              out1[0] = 0;
698              out0[3] = 0;
699              out0[2] = 0;
700              out0[1] = 0;
701              out0[0] = 0;
702              break;
703    case 27:  out1[3] = hc_bytealign_S (in0[0], in0[1], 3);
704              out1[2] = hc_bytealign_S (     0, in0[0], 3);
705              out1[1] = 0;
706              out1[0] = 0;
707              out0[3] = 0;
708              out0[2] = 0;
709              out0[1] = 0;
710              out0[0] = 0;
711              break;
712    case 28:  out1[3] = in0[0];
713              out1[2] = 0;
714              out1[1] = 0;
715              out1[0] = 0;
716              out0[3] = 0;
717              out0[2] = 0;
718              out0[1] = 0;
719              out0[0] = 0;
720              break;
721    case 29:  out1[3] = hc_bytealign_S (     0, in0[0], 1);
722              out1[2] = 0;
723              out1[1] = 0;
724              out1[0] = 0;
725              out0[3] = 0;
726              out0[2] = 0;
727              out0[1] = 0;
728              out0[0] = 0;
729              break;
730    case 30:  out1[3] = hc_bytealign_S (     0, in0[0], 2);
731              out1[2] = 0;
732              out1[1] = 0;
733              out1[0] = 0;
734              out0[3] = 0;
735              out0[2] = 0;
736              out0[1] = 0;
737              out0[0] = 0;
738              break;
739    case 31:  out1[3] = hc_bytealign_S (     0, in0[0], 3);
740              out1[2] = 0;
741              out1[1] = 0;
742              out1[0] = 0;
743              out0[3] = 0;
744              out0[2] = 0;
745              out0[1] = 0;
746              out0[0] = 0;
747              break;
748  }
749}
750
751DECLSPEC void append_block1_optimized (const u32 offset, u32 *buf0, u32 *buf1, const u32 src_r0)
752{
753  // this version works with 1 byte append only
754  const u32 value = src_r0 & 0xff;
755
756  const u32 tmp = value <<  0
757                | value <<  8
758                | value << 16
759                | value << 24;
760
761  u32 v[4];
762
763  set_mark_1x4_S (v, offset);
764
765  const u32 offset16 = offset / 16;
766
767  append_helper_1x4_S (buf0, ((offset16 == 0) ? tmp : 0), v);
768  append_helper_1x4_S (buf1, ((offset16 == 1) ? tmp : 0), v);
769}
770
771DECLSPEC void append_block8_optimized (const u32 offset, u32 *buf0, u32 *buf1, const u32 *src_l0, const u32 *src_l1, const u32 *src_r0, const u32 *src_r1)
772{
773  u32 s0 = 0;
774  u32 s1 = 0;
775  u32 s2 = 0;
776  u32 s3 = 0;
777  u32 s4 = 0;
778  u32 s5 = 0;
779  u32 s6 = 0;
780  u32 s7 = 0;
781
782  const int offset_switch = offset / 4;
783
784  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC
785  const u32 src_r00 = src_r0[0];
786  const u32 src_r01 = src_r0[1];
787  const u32 src_r02 = src_r0[2];
788  const u32 src_r03 = src_r0[3];
789  const u32 src_r10 = src_r1[0];
790  const u32 src_r11 = src_r1[1];
791  const u32 src_r12 = src_r1[2];
792  const u32 src_r13 = src_r1[3];
793
794  switch (offset_switch)
795  {
796    case 0:
797      s7 = hc_bytealign_S (src_r12, src_r13, offset);
798      s6 = hc_bytealign_S (src_r11, src_r12, offset);
799      s5 = hc_bytealign_S (src_r10, src_r11, offset);
800      s4 = hc_bytealign_S (src_r03, src_r10, offset);
801      s3 = hc_bytealign_S (src_r02, src_r03, offset);
802      s2 = hc_bytealign_S (src_r01, src_r02, offset);
803      s1 = hc_bytealign_S (src_r00, src_r01, offset);
804      s0 = hc_bytealign_S (      0, src_r00, offset);
805      break;
806
807    case 1:
808      s7 = hc_bytealign_S (src_r11, src_r12, offset);
809      s6 = hc_bytealign_S (src_r10, src_r11, offset);
810      s5 = hc_bytealign_S (src_r03, src_r10, offset);
811      s4 = hc_bytealign_S (src_r02, src_r03, offset);
812      s3 = hc_bytealign_S (src_r01, src_r02, offset);
813      s2 = hc_bytealign_S (src_r00, src_r01, offset);
814      s1 = hc_bytealign_S (      0, src_r00, offset);
815      s0 = 0;
816      break;
817
818    case 2:
819      s7 = hc_bytealign_S (src_r10, src_r11, offset);
820      s6 = hc_bytealign_S (src_r03, src_r10, offset);
821      s5 = hc_bytealign_S (src_r02, src_r03, offset);
822      s4 = hc_bytealign_S (src_r01, src_r02, offset);
823      s3 = hc_bytealign_S (src_r00, src_r01, offset);
824      s2 = hc_bytealign_S (      0, src_r00, offset);
825      s1 = 0;
826      s0 = 0;
827      break;
828
829    case 3:
830      s7 = hc_bytealign_S (src_r03, src_r10, offset);
831      s6 = hc_bytealign_S (src_r02, src_r03, offset);
832      s5 = hc_bytealign_S (src_r01, src_r02, offset);
833      s4 = hc_bytealign_S (src_r00, src_r01, offset);
834      s3 = hc_bytealign_S (      0, src_r00, offset);
835      s2 = 0;
836      s1 = 0;
837      s0 = 0;
838
839      break;
840
841    case 4:
842      s7 = hc_bytealign_S (src_r02, src_r03, offset);
843      s6 = hc_bytealign_S (src_r01, src_r02, offset);
844      s5 = hc_bytealign_S (src_r00, src_r01, offset);
845      s4 = hc_bytealign_S (      0, src_r00, offset);
846      s3 = 0;
847      s2 = 0;
848      s1 = 0;
849      s0 = 0;
850      break;
851
852    case 5:
853      s7 = hc_bytealign_S (src_r01, src_r02, offset);
854      s6 = hc_bytealign_S (src_r00, src_r01, offset);
855      s5 = hc_bytealign_S (      0, src_r00, offset);
856      s4 = 0;
857      s3 = 0;
858      s2 = 0;
859      s1 = 0;
860      s0 = 0;
861      break;
862
863    case 6:
864      s7 = hc_bytealign_S (src_r00, src_r01, offset);
865      s6 = hc_bytealign_S (      0, src_r00, offset);
866      s5 = 0;
867      s4 = 0;
868      s3 = 0;
869      s2 = 0;
870      s1 = 0;
871      s0 = 0;
872      break;
873
874    case 7:
875      s7 = hc_bytealign_S (      0, src_r00, offset);
876      s6 = 0;
877      s5 = 0;
878      s4 = 0;
879      s3 = 0;
880      s2 = 0;
881      s1 = 0;
882      s0 = 0;
883      break;
884  }
885  #endif
886
887  #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV
888
889  const int offset_mod_4 = offset & 3;
890
891  const int offset_minus_4 = 4 - offset_mod_4;
892
893  #if defined IS_NV
894  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
895  #endif
896
897  #if (defined IS_AMD || defined IS_HIP)
898  const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8));
899  #endif
900
901  const u32 src_r00 = src_r0[0];
902  const u32 src_r01 = src_r0[1];
903  const u32 src_r02 = src_r0[2];
904  const u32 src_r03 = src_r0[3];
905  const u32 src_r10 = src_r1[0];
906  const u32 src_r11 = src_r1[1];
907  const u32 src_r12 = src_r1[2];
908  const u32 src_r13 = src_r1[3];
909
910  switch (offset_switch)
911  {
912    case 0:
913      s7 = hc_byte_perm_S (src_r12, src_r13, selector);
914      s6 = hc_byte_perm_S (src_r11, src_r12, selector);
915      s5 = hc_byte_perm_S (src_r10, src_r11, selector);
916      s4 = hc_byte_perm_S (src_r03, src_r10, selector);
917      s3 = hc_byte_perm_S (src_r02, src_r03, selector);
918      s2 = hc_byte_perm_S (src_r01, src_r02, selector);
919      s1 = hc_byte_perm_S (src_r00, src_r01, selector);
920      s0 = hc_byte_perm_S (      0, src_r00, selector);
921      break;
922
923    case 1:
924      s7 = hc_byte_perm_S (src_r11, src_r12, selector);
925      s6 = hc_byte_perm_S (src_r10, src_r11, selector);
926      s5 = hc_byte_perm_S (src_r03, src_r10, selector);
927      s4 = hc_byte_perm_S (src_r02, src_r03, selector);
928      s3 = hc_byte_perm_S (src_r01, src_r02, selector);
929      s2 = hc_byte_perm_S (src_r00, src_r01, selector);
930      s1 = hc_byte_perm_S (      0, src_r00, selector);
931      s0 = 0;
932      break;
933
934    case 2:
935      s7 = hc_byte_perm_S (src_r10, src_r11, selector);
936      s6 = hc_byte_perm_S (src_r03, src_r10, selector);
937      s5 = hc_byte_perm_S (src_r02, src_r03, selector);
938      s4 = hc_byte_perm_S (src_r01, src_r02, selector);
939      s3 = hc_byte_perm_S (src_r00, src_r01, selector);
940      s2 = hc_byte_perm_S (      0, src_r00, selector);
941      s1 = 0;
942      s0 = 0;
943      break;
944
945    case 3:
946      s7 = hc_byte_perm_S (src_r03, src_r10, selector);
947      s6 = hc_byte_perm_S (src_r02, src_r03, selector);
948      s5 = hc_byte_perm_S (src_r01, src_r02, selector);
949      s4 = hc_byte_perm_S (src_r00, src_r01, selector);
950      s3 = hc_byte_perm_S (      0, src_r00, selector);
951      s2 = 0;
952      s1 = 0;
953      s0 = 0;
954
955      break;
956
957    case 4:
958      s7 = hc_byte_perm_S (src_r02, src_r03, selector);
959      s6 = hc_byte_perm_S (src_r01, src_r02, selector);
960      s5 = hc_byte_perm_S (src_r00, src_r01, selector);
961      s4 = hc_byte_perm_S (      0, src_r00, selector);
962      s3 = 0;
963      s2 = 0;
964      s1 = 0;
965      s0 = 0;
966      break;
967
968    case 5:
969      s7 = hc_byte_perm_S (src_r01, src_r02, selector);
970      s6 = hc_byte_perm_S (src_r00, src_r01, selector);
971      s5 = hc_byte_perm_S (      0, src_r00, selector);
972      s4 = 0;
973      s3 = 0;
974      s2 = 0;
975      s1 = 0;
976      s0 = 0;
977      break;
978
979    case 6:
980      s7 = hc_byte_perm_S (src_r00, src_r01, selector);
981      s6 = hc_byte_perm_S (      0, src_r00, selector);
982      s5 = 0;
983      s4 = 0;
984      s3 = 0;
985      s2 = 0;
986      s1 = 0;
987      s0 = 0;
988      break;
989
990    case 7:
991      s7 = hc_byte_perm_S (      0, src_r00, selector);
992      s6 = 0;
993      s5 = 0;
994      s4 = 0;
995      s3 = 0;
996      s2 = 0;
997      s1 = 0;
998      s0 = 0;
999      break;
1000  }
1001  #endif
1002
1003  buf0[0] = src_l0[0] | s0;
1004  buf0[1] = src_l0[1] | s1;
1005  buf0[2] = src_l0[2] | s2;
1006  buf0[3] = src_l0[3] | s3;
1007  buf1[0] = src_l1[0] | s4;
1008  buf1[1] = src_l1[1] | s5;
1009  buf1[2] = src_l1[2] | s6;
1010  buf1[3] = src_l1[3] | s7;
1011}
1012
1013DECLSPEC void reverse_block_optimized (u32 *in0, u32 *in1, u32 *out0, u32 *out1, const u32 len)
1014{
1015  rshift_block_optimized_N (in0, in1, out0, out1, 32 - len);
1016
1017  u32 tib40[4];
1018  u32 tib41[4];
1019
1020  tib40[0] = out1[3];
1021  tib40[1] = out1[2];
1022  tib40[2] = out1[1];
1023  tib40[3] = out1[0];
1024  tib41[0] = out0[3];
1025  tib41[1] = out0[2];
1026  tib41[2] = out0[1];
1027  tib41[3] = out0[0];
1028
1029  out0[0] = hc_swap32_S (tib40[0]);
1030  out0[1] = hc_swap32_S (tib40[1]);
1031  out0[2] = hc_swap32_S (tib40[2]);
1032  out0[3] = hc_swap32_S (tib40[3]);
1033  out1[0] = hc_swap32_S (tib41[0]);
1034  out1[1] = hc_swap32_S (tib41[1]);
1035  out1[2] = hc_swap32_S (tib41[2]);
1036  out1[3] = hc_swap32_S (tib41[3]);
1037}
1038
1039DECLSPEC void exchange_byte_optimized (u32 *buf, const int off_src, const int off_dst)
1040{
1041  u8 *ptr = (u8 *) buf;
1042
1043  const u8 tmp = ptr[off_src];
1044
1045  ptr[off_src] = ptr[off_dst];
1046  ptr[off_dst] = tmp;
1047}
1048
1049DECLSPEC u32 rule_op_mangle_lrest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1050{
1051  u32 t;
1052
1053  t = buf0[0]; buf0[0] = t | generate_cmask_optimized (t);
1054  t = buf0[1]; buf0[1] = t | generate_cmask_optimized (t);
1055  t = buf0[2]; buf0[2] = t | generate_cmask_optimized (t);
1056  t = buf0[3]; buf0[3] = t | generate_cmask_optimized (t);
1057  t = buf1[0]; buf1[0] = t | generate_cmask_optimized (t);
1058  t = buf1[1]; buf1[1] = t | generate_cmask_optimized (t);
1059  t = buf1[2]; buf1[2] = t | generate_cmask_optimized (t);
1060  t = buf1[3]; buf1[3] = t | generate_cmask_optimized (t);
1061
1062  return in_len;
1063}
1064
1065DECLSPEC u32 rule_op_mangle_urest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1066{
1067  u32 t;
1068
1069  t = buf0[0]; buf0[0] = t & ~(generate_cmask_optimized (t));
1070  t = buf0[1]; buf0[1] = t & ~(generate_cmask_optimized (t));
1071  t = buf0[2]; buf0[2] = t & ~(generate_cmask_optimized (t));
1072  t = buf0[3]; buf0[3] = t & ~(generate_cmask_optimized (t));
1073  t = buf1[0]; buf1[0] = t & ~(generate_cmask_optimized (t));
1074  t = buf1[1]; buf1[1] = t & ~(generate_cmask_optimized (t));
1075  t = buf1[2]; buf1[2] = t & ~(generate_cmask_optimized (t));
1076  t = buf1[3]; buf1[3] = t & ~(generate_cmask_optimized (t));
1077
1078  return in_len;
1079}
1080
1081DECLSPEC u32 rule_op_mangle_lrest_ufirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1082{
1083  u32 t;
1084
1085  t = buf0[0]; buf0[0] = t | generate_cmask_optimized (t);
1086  t = buf0[1]; buf0[1] = t | generate_cmask_optimized (t);
1087  t = buf0[2]; buf0[2] = t | generate_cmask_optimized (t);
1088  t = buf0[3]; buf0[3] = t | generate_cmask_optimized (t);
1089  t = buf1[0]; buf1[0] = t | generate_cmask_optimized (t);
1090  t = buf1[1]; buf1[1] = t | generate_cmask_optimized (t);
1091  t = buf1[2]; buf1[2] = t | generate_cmask_optimized (t);
1092  t = buf1[3]; buf1[3] = t | generate_cmask_optimized (t);
1093
1094  t = buf0[0]; buf0[0] = t & ~(0x00000020 & generate_cmask_optimized (t));
1095
1096  return in_len;
1097}
1098
1099DECLSPEC u32 rule_op_mangle_urest_lfirst (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1100{
1101  u32 t;
1102
1103  t = buf0[0]; buf0[0] = t & ~(generate_cmask_optimized (t));
1104  t = buf0[1]; buf0[1] = t & ~(generate_cmask_optimized (t));
1105  t = buf0[2]; buf0[2] = t & ~(generate_cmask_optimized (t));
1106  t = buf0[3]; buf0[3] = t & ~(generate_cmask_optimized (t));
1107  t = buf1[0]; buf1[0] = t & ~(generate_cmask_optimized (t));
1108  t = buf1[1]; buf1[1] = t & ~(generate_cmask_optimized (t));
1109  t = buf1[2]; buf1[2] = t & ~(generate_cmask_optimized (t));
1110  t = buf1[3]; buf1[3] = t & ~(generate_cmask_optimized (t));
1111
1112  t = buf0[0]; buf0[0] = t | (0x00000020 & generate_cmask_optimized (t));
1113
1114  return in_len;
1115}
1116
1117DECLSPEC u32 rule_op_mangle_trest (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1118{
1119  u32 t;
1120
1121  t = buf0[0]; buf0[0] = t ^ generate_cmask_optimized (t);
1122  t = buf0[1]; buf0[1] = t ^ generate_cmask_optimized (t);
1123  t = buf0[2]; buf0[2] = t ^ generate_cmask_optimized (t);
1124  t = buf0[3]; buf0[3] = t ^ generate_cmask_optimized (t);
1125  t = buf1[0]; buf1[0] = t ^ generate_cmask_optimized (t);
1126  t = buf1[1]; buf1[1] = t ^ generate_cmask_optimized (t);
1127  t = buf1[2]; buf1[2] = t ^ generate_cmask_optimized (t);
1128  t = buf1[3]; buf1[3] = t ^ generate_cmask_optimized (t);
1129
1130  return in_len;
1131}
1132
1133DECLSPEC u32 rule_op_mangle_toggle_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1134{
1135  if (p0 >= in_len) return in_len;
1136
1137  u32 t[8];
1138
1139  t[0] = buf0[0];
1140  t[1] = buf0[1];
1141  t[2] = buf0[2];
1142  t[3] = buf0[3];
1143  t[4] = buf1[0];
1144  t[5] = buf1[1];
1145  t[6] = buf1[2];
1146  t[7] = buf1[3];
1147
1148  const u32 tmp = t[p0 / 4];
1149
1150  const u32 m = 0x20u << ((p0 & 3) * 8);
1151
1152  t[p0 / 4] = tmp ^ (m & generate_cmask_optimized (tmp));
1153
1154  buf0[0] = t[0];
1155  buf0[1] = t[1];
1156  buf0[2] = t[2];
1157  buf0[3] = t[3];
1158  buf1[0] = t[4];
1159  buf1[1] = t[5];
1160  buf1[2] = t[6];
1161  buf1[3] = t[7];
1162
1163  return (in_len);
1164}
1165
1166DECLSPEC u32 rule_op_mangle_toggle_at_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1167{
1168  if (in_len == 0) return in_len;
1169
1170  u32 r0 = search_on_register (buf0[0], p1);
1171  u32 r1 = search_on_register (buf0[1], p1);
1172  u32 r2 = search_on_register (buf0[2], p1);
1173  u32 r3 = search_on_register (buf0[3], p1);
1174  u32 r4 = search_on_register (buf1[0], p1);
1175  u32 r5 = search_on_register (buf1[1], p1);
1176  u32 r6 = search_on_register (buf1[2], p1);
1177  u32 r7 = search_on_register (buf1[3], p1);
1178
1179  const u32 rn = (r0 <<  0)
1180               | (r1 <<  4)
1181               | (r2 <<  8)
1182               | (r3 << 12)
1183               | (r4 << 16)
1184               | (r5 << 20)
1185               | (r6 << 24)
1186               | (r7 << 28);
1187
1188  if (rn == 0) return in_len;
1189
1190  u32 occurence = 0;
1191
1192  u32 ro = 0;
1193
1194  #ifdef _unroll
1195  #pragma unroll
1196  #endif
1197  for (int i = 0; i < 32; i++)
1198  {
1199    if ((rn >> i) & 1)
1200    {
1201      if (occurence == p0)
1202      {
1203        ro = 1 << i;
1204
1205        break;
1206      }
1207
1208      occurence++;
1209    }
1210  }
1211
1212  r0 = (ro >>  0) & 15;
1213  r1 = (ro >>  4) & 15;
1214  r2 = (ro >>  8) & 15;
1215  r3 = (ro >> 12) & 15;
1216  r4 = (ro >> 16) & 15;
1217  r5 = (ro >> 20) & 15;
1218  r6 = (ro >> 24) & 15;
1219  r7 = (ro >> 28) & 15;
1220
1221  r0 <<= 1;
1222  r1 <<= 1; r1 |= r0 >> 4;
1223  r2 <<= 1; r2 |= r1 >> 4;
1224  r3 <<= 1; r3 |= r2 >> 4;
1225  r4 <<= 1; r4 |= r3 >> 4;
1226  r5 <<= 1; r5 |= r4 >> 4;
1227  r6 <<= 1; r6 |= r5 >> 4;
1228  r7 <<= 1; r7 |= r6 >> 4;
1229
1230  buf0[0] = toggle_on_register (buf0[0], r0);
1231  buf0[1] = toggle_on_register (buf0[1], r1);
1232  buf0[2] = toggle_on_register (buf0[2], r2);
1233  buf0[3] = toggle_on_register (buf0[3], r3);
1234  buf1[0] = toggle_on_register (buf1[0], r4);
1235  buf1[1] = toggle_on_register (buf1[1], r5);
1236  buf1[2] = toggle_on_register (buf1[2], r6);
1237  buf1[3] = toggle_on_register (buf1[3], r7);
1238
1239  return in_len;
1240}
1241
1242DECLSPEC u32 rule_op_mangle_reverse (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1243{
1244  reverse_block_optimized (buf0, buf1, buf0, buf1, in_len);
1245
1246  return in_len;
1247}
1248
1249DECLSPEC u32 rule_op_mangle_dupeword (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1250{
1251  if ((in_len + in_len) >= 32) return in_len;
1252
1253  u32 out_len = in_len;
1254
1255  append_block8_optimized (out_len, buf0, buf1, buf0, buf1, buf0, buf1);
1256
1257  out_len += in_len;
1258
1259  return out_len;
1260}
1261
1262DECLSPEC u32 rule_op_mangle_dupeword_times (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1263{
1264  if (((in_len * p0) + in_len) >= 32) return in_len;
1265
1266  u32 out_len = in_len;
1267
1268  u32 tib40[4];
1269  u32 tib41[4];
1270
1271  tib40[0] = buf0[0];
1272  tib40[1] = buf0[1];
1273  tib40[2] = buf0[2];
1274  tib40[3] = buf0[3];
1275  tib41[0] = buf1[0];
1276  tib41[1] = buf1[1];
1277  tib41[2] = buf1[2];
1278  tib41[3] = buf1[3];
1279
1280  for (u32 i = 0; i < p0; i++)
1281  {
1282    append_block8_optimized (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
1283
1284    out_len += in_len;
1285  }
1286
1287  return out_len;
1288}
1289
1290DECLSPEC u32 rule_op_mangle_reflect (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1291{
1292  if ((in_len + in_len) >= 32) return in_len;
1293
1294  u32 out_len = in_len;
1295
1296  u32 tib40[4] = { 0 };
1297  u32 tib41[4] = { 0 };
1298
1299  reverse_block_optimized (buf0, buf1, tib40, tib41, out_len);
1300
1301  append_block8_optimized (out_len, buf0, buf1, buf0, buf1, tib40, tib41);
1302
1303  out_len += in_len;
1304
1305  return out_len;
1306}
1307
1308DECLSPEC u32 rule_op_mangle_append (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1309{
1310  if ((in_len + 1) >= 32) return in_len;
1311
1312  u32 out_len = in_len;
1313
1314  append_block1_optimized (out_len, buf0, buf1, p0);
1315
1316  out_len++;
1317
1318  return out_len;
1319}
1320
1321DECLSPEC u32 rule_op_mangle_prepend (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1322{
1323  if ((in_len + 1) >= 32) return in_len;
1324
1325  u32 out_len = in_len;
1326
1327  rshift_block_optimized (buf0, buf1, buf0, buf1);
1328
1329  buf0[0] = buf0[0] | p0;
1330
1331  out_len++;
1332
1333  return out_len;
1334}
1335
1336DECLSPEC u32 rule_op_mangle_rotate_left (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1337{
1338  if (in_len == 0) return in_len;
1339
1340  const u32 in_len1 = in_len - 1;
1341
1342  const u32 tmp = buf0[0];
1343
1344  lshift_block_optimized (buf0, buf1, buf0, buf1);
1345
1346  append_block1_optimized (in_len1, buf0, buf1, tmp);
1347
1348  return in_len;
1349}
1350
1351DECLSPEC u32 rule_op_mangle_rotate_right (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1352{
1353  if (in_len == 0) return in_len;
1354
1355  const u32 in_len1 = in_len - 1;
1356
1357  const u32 sh = (in_len1 & 3) * 8;
1358
1359  u32 tmp = 0;
1360
1361  u32 v[4];
1362
1363  set_mark_1x4_S (v, in_len1);
1364
1365  switch (in_len1 / 16)
1366  {
1367    case 0:
1368      tmp |= buf0[0] & v[0];
1369      tmp |= buf0[1] & v[1];
1370      tmp |= buf0[2] & v[2];
1371      tmp |= buf0[3] & v[3];
1372      break;
1373
1374    case 1:
1375      tmp |= buf1[0] & v[0];
1376      tmp |= buf1[1] & v[1];
1377      tmp |= buf1[2] & v[2];
1378      tmp |= buf1[3] & v[3];
1379      break;
1380  }
1381
1382  tmp = (tmp >> sh) & 0xff;
1383
1384  rshift_block_optimized (buf0, buf1, buf0, buf1);
1385
1386  buf0[0] |= tmp;
1387
1388  truncate_right_optimized (buf0, buf1, in_len);
1389
1390  return in_len;
1391}
1392
1393DECLSPEC u32 rule_op_mangle_delete_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1394{
1395  if (in_len == 0) return in_len;
1396
1397  const u32 in_len1 = in_len - 1;
1398
1399  lshift_block_optimized (buf0, buf1, buf0, buf1);
1400
1401  return in_len1;
1402}
1403
1404DECLSPEC u32 rule_op_mangle_delete_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1405{
1406  if (in_len == 0) return in_len;
1407
1408  const u32 in_len1 = in_len - 1;
1409
1410  const u32 mask = (1 << ((in_len1 & 3) * 8)) - 1;
1411
1412  buf0[0] &=                     (in_len1 <  4)  ? mask : 0xffffffff;
1413  buf0[1] &= ((in_len1 >=  4) && (in_len1 <  8)) ? mask : 0xffffffff;
1414  buf0[2] &= ((in_len1 >=  8) && (in_len1 < 12)) ? mask : 0xffffffff;
1415  buf0[3] &= ((in_len1 >= 12) && (in_len1 < 16)) ? mask : 0xffffffff;
1416  buf1[0] &= ((in_len1 >= 16) && (in_len1 < 20)) ? mask : 0xffffffff;
1417  buf1[1] &= ((in_len1 >= 20) && (in_len1 < 24)) ? mask : 0xffffffff;
1418  buf1[2] &= ((in_len1 >= 24) && (in_len1 < 28)) ? mask : 0xffffffff;
1419  buf1[3] &=  (in_len1 >= 28)                    ? mask : 0xffffffff;
1420
1421  return in_len1;
1422}
1423
1424DECLSPEC u32 rule_op_mangle_delete_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1425{
1426  if (p0 >= in_len) return in_len;
1427
1428  u32 out_len = in_len;
1429
1430  u32 tib40[4];
1431  u32 tib41[4];
1432
1433  lshift_block_optimized (buf0, buf1, tib40, tib41);
1434
1435  const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
1436  const u32 mr = ~ml;
1437
1438  const int p0_switch = p0 / 4;
1439
1440  switch (p0_switch)
1441  {
1442    case  0:  buf0[0] =  (buf0[0] & ml)
1443                      | (tib40[0] & mr);
1444              buf0[1] =  tib40[1];
1445              buf0[2] =  tib40[2];
1446              buf0[3] =  tib40[3];
1447              buf1[0] =  tib41[0];
1448              buf1[1] =  tib41[1];
1449              buf1[2] =  tib41[2];
1450              buf1[3] =  tib41[3];
1451              break;
1452    case  1:  buf0[1] =  (buf0[1] & ml)
1453                      | (tib40[1] & mr);
1454              buf0[2] =  tib40[2];
1455              buf0[3] =  tib40[3];
1456              buf1[0] =  tib41[0];
1457              buf1[1] =  tib41[1];
1458              buf1[2] =  tib41[2];
1459              buf1[3] =  tib41[3];
1460              break;
1461    case  2:  buf0[2] =  (buf0[2] & ml)
1462                      | (tib40[2] & mr);
1463              buf0[3] =  tib40[3];
1464              buf1[0] =  tib41[0];
1465              buf1[1] =  tib41[1];
1466              buf1[2] =  tib41[2];
1467              buf1[3] =  tib41[3];
1468              break;
1469    case  3:  buf0[3] =  (buf0[3] & ml)
1470                      | (tib40[3] & mr);
1471              buf1[0] =  tib41[0];
1472              buf1[1] =  tib41[1];
1473              buf1[2] =  tib41[2];
1474              buf1[3] =  tib41[3];
1475              break;
1476    case  4:  buf1[0] =  (buf1[0] & ml)
1477                      | (tib41[0] & mr);
1478              buf1[1] =  tib41[1];
1479              buf1[2] =  tib41[2];
1480              buf1[3] =  tib41[3];
1481              break;
1482    case  5:  buf1[1] =  (buf1[1] & ml)
1483                      | (tib41[1] & mr);
1484              buf1[2] =  tib41[2];
1485              buf1[3] =  tib41[3];
1486              break;
1487    case  6:  buf1[2] =  (buf1[2] & ml)
1488                      | (tib41[2] & mr);
1489              buf1[3] =  tib41[3];
1490              break;
1491    case  7:  buf1[3] =  (buf1[3] & ml)
1492                      | (tib41[3] & mr);
1493              break;
1494  }
1495
1496  out_len--;
1497
1498  return out_len;
1499}
1500
1501DECLSPEC u32 rule_op_mangle_extract (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1502{
1503  if (p0 >= in_len) return in_len;
1504
1505  if ((p0 + p1) > in_len) return in_len;
1506
1507  u32 out_len = p1;
1508
1509  lshift_block_optimized_N (buf0, buf1, buf0, buf1, p0);
1510
1511  truncate_right_optimized (buf0, buf1, out_len);
1512
1513  return out_len;
1514}
1515
1516DECLSPEC u32 rule_op_mangle_omit (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1517{
1518  if (p0 >= in_len) return in_len;
1519
1520  if ((p0 + p1) > in_len) return in_len;
1521
1522  u32 out_len = in_len;
1523
1524  u32 tib40[4];
1525  u32 tib41[4];
1526
1527  tib40[0] = 0;
1528  tib40[1] = 0;
1529  tib40[2] = 0;
1530  tib40[3] = 0;
1531  tib41[0] = 0;
1532  tib41[1] = 0;
1533  tib41[2] = 0;
1534  tib41[3] = 0;
1535
1536  lshift_block_optimized_N (buf0, buf1, tib40, tib41, p1);
1537
1538  const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
1539  const u32 mr = ~ml;
1540
1541  const int p0_switch = p0 / 4;
1542
1543  switch (p0_switch)
1544  {
1545    case  0:  buf0[0] =  (buf0[0] & ml)
1546                      | (tib40[0] & mr);
1547              buf0[1] =  tib40[1];
1548              buf0[2] =  tib40[2];
1549              buf0[3] =  tib40[3];
1550              buf1[0] =  tib41[0];
1551              buf1[1] =  tib41[1];
1552              buf1[2] =  tib41[2];
1553              buf1[3] =  tib41[3];
1554              break;
1555    case  1:  buf0[1] =  (buf0[1] & ml)
1556                      | (tib40[1] & mr);
1557              buf0[2] =  tib40[2];
1558              buf0[3] =  tib40[3];
1559              buf1[0] =  tib41[0];
1560              buf1[1] =  tib41[1];
1561              buf1[2] =  tib41[2];
1562              buf1[3] =  tib41[3];
1563              break;
1564    case  2:  buf0[2] =  (buf0[2] & ml)
1565                      | (tib40[2] & mr);
1566              buf0[3] =  tib40[3];
1567              buf1[0] =  tib41[0];
1568              buf1[1] =  tib41[1];
1569              buf1[2] =  tib41[2];
1570              buf1[3] =  tib41[3];
1571              break;
1572    case  3:  buf0[3] =  (buf0[3] & ml)
1573                      | (tib40[3] & mr);
1574              buf1[0] =  tib41[0];
1575              buf1[1] =  tib41[1];
1576              buf1[2] =  tib41[2];
1577              buf1[3] =  tib41[3];
1578              break;
1579    case  4:  buf1[0] =  (buf1[0] & ml)
1580                      | (tib41[0] & mr);
1581              buf1[1] =  tib41[1];
1582              buf1[2] =  tib41[2];
1583              buf1[3] =  tib41[3];
1584              break;
1585    case  5:  buf1[1] =  (buf1[1] & ml)
1586                      | (tib41[1] & mr);
1587              buf1[2] =  tib41[2];
1588              buf1[3] =  tib41[3];
1589              break;
1590    case  6:  buf1[2] =  (buf1[2] & ml)
1591                      | (tib41[2] & mr);
1592              buf1[3] =  tib41[3];
1593              break;
1594    case  7:  buf1[3] =  (buf1[3] & ml)
1595                      | (tib41[3] & mr);
1596              break;
1597  }
1598
1599  out_len -= p1;
1600
1601  return out_len;
1602}
1603
1604DECLSPEC u32 rule_op_mangle_insert (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1605{
1606  if (p0 > in_len) return in_len;
1607
1608  if ((in_len + 1) >= 32) return in_len;
1609
1610  u32 out_len = in_len;
1611
1612  u32 tib40[4];
1613  u32 tib41[4];
1614
1615  rshift_block_optimized (buf0, buf1, tib40, tib41);
1616
1617  const u32 p1n = p1 << ((p0 & 3) * 8);
1618
1619  const u32 ml = (1 << ((p0 & 3) * 8)) - 1;
1620
1621  const u32 mr = 0xffffff00 << ((p0 & 3) * 8);
1622
1623  const int p0_switch = p0 / 4;
1624
1625  switch (p0_switch)
1626  {
1627    case  0:  buf0[0] =  (buf0[0] & ml) | p1n | (tib40[0] & mr);
1628              buf0[1] =  tib40[1];
1629              buf0[2] =  tib40[2];
1630              buf0[3] =  tib40[3];
1631              buf1[0] =  tib41[0];
1632              buf1[1] =  tib41[1];
1633              buf1[2] =  tib41[2];
1634              buf1[3] =  tib41[3];
1635              break;
1636    case  1:  buf0[1] =  (buf0[1] & ml) | p1n | (tib40[1] & mr);
1637              buf0[2] =  tib40[2];
1638              buf0[3] =  tib40[3];
1639              buf1[0] =  tib41[0];
1640              buf1[1] =  tib41[1];
1641              buf1[2] =  tib41[2];
1642              buf1[3] =  tib41[3];
1643              break;
1644    case  2:  buf0[2] =  (buf0[2] & ml) | p1n | (tib40[2] & mr);
1645              buf0[3] =  tib40[3];
1646              buf1[0] =  tib41[0];
1647              buf1[1] =  tib41[1];
1648              buf1[2] =  tib41[2];
1649              buf1[3] =  tib41[3];
1650              break;
1651    case  3:  buf0[3] =  (buf0[3] & ml) | p1n | (tib40[3] & mr);
1652              buf1[0] =  tib41[0];
1653              buf1[1] =  tib41[1];
1654              buf1[2] =  tib41[2];
1655              buf1[3] =  tib41[3];
1656              break;
1657    case  4:  buf1[0] =  (buf1[0] & ml) | p1n | (tib41[0] & mr);
1658              buf1[1] =  tib41[1];
1659              buf1[2] =  tib41[2];
1660              buf1[3] =  tib41[3];
1661              break;
1662    case  5:  buf1[1] =  (buf1[1] & ml) | p1n | (tib41[1] & mr);
1663              buf1[2] =  tib41[2];
1664              buf1[3] =  tib41[3];
1665              break;
1666    case  6:  buf1[2] =  (buf1[2] & ml) | p1n | (tib41[2] & mr);
1667              buf1[3] =  tib41[3];
1668              break;
1669    case  7:  buf1[3] =  (buf1[3] & ml) | p1n | (tib41[3] & mr);
1670              break;
1671  }
1672
1673  out_len++;
1674
1675  return out_len;
1676}
1677
1678DECLSPEC u32 rule_op_mangle_overstrike (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1679{
1680  if (p0 >= in_len) return in_len;
1681
1682  const u32 p1n = p1 << ((p0 & 3) * 8);
1683
1684  const u32 m = ~(0xffu << ((p0 & 3) * 8));
1685
1686  u32 t[8];
1687
1688  t[0] = buf0[0];
1689  t[1] = buf0[1];
1690  t[2] = buf0[2];
1691  t[3] = buf0[3];
1692  t[4] = buf1[0];
1693  t[5] = buf1[1];
1694  t[6] = buf1[2];
1695  t[7] = buf1[3];
1696
1697  const u32 tmp = t[p0 / 4];
1698
1699  t[p0 / 4] = (tmp & m) | p1n;
1700
1701  buf0[0] = t[0];
1702  buf0[1] = t[1];
1703  buf0[2] = t[2];
1704  buf0[3] = t[3];
1705  buf1[0] = t[4];
1706  buf1[1] = t[5];
1707  buf1[2] = t[6];
1708  buf1[3] = t[7];
1709
1710  return in_len;
1711}
1712
1713DECLSPEC u32 rule_op_mangle_truncate_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1714{
1715  if (p0 >= in_len) return in_len;
1716
1717  truncate_right_optimized (buf0, buf1, p0);
1718
1719  return p0;
1720}
1721
1722DECLSPEC u32 search_on_register (const u32 in, const u32 p0)
1723{
1724  u32 r = 0;
1725
1726  if (hc_bfe_S (in,  0, 8) == p0) r |= 1;
1727  if (hc_bfe_S (in,  8, 8) == p0) r |= 2;
1728  if (hc_bfe_S (in, 16, 8) == p0) r |= 4;
1729  if (hc_bfe_S (in, 24, 8) == p0) r |= 8;
1730
1731  return r;
1732}
1733
1734DECLSPEC u32 replace_on_register (const u32 in, const u32 r, const u32 p1)
1735{
1736  u32 out = in;
1737
1738  if (r & 1) out = (out & 0xffffff00) | (p1 <<  0);
1739  if (r & 2) out = (out & 0xffff00ff) | (p1 <<  8);
1740  if (r & 4) out = (out & 0xff00ffff) | (p1 << 16);
1741  if (r & 8) out = (out & 0x00ffffff) | (p1 << 24);
1742
1743  return out;
1744}
1745
1746DECLSPEC u32 rule_op_mangle_replace (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1747{
1748  const u32 r0 = search_on_register (buf0[0], p0);
1749  const u32 r1 = search_on_register (buf0[1], p0);
1750  const u32 r2 = search_on_register (buf0[2], p0);
1751  const u32 r3 = search_on_register (buf0[3], p0);
1752  const u32 r4 = search_on_register (buf1[0], p0);
1753  const u32 r5 = search_on_register (buf1[1], p0);
1754  const u32 r6 = search_on_register (buf1[2], p0);
1755  const u32 r7 = search_on_register (buf1[3], p0);
1756
1757  const u32 rn = r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7;
1758
1759  if (rn == 0) return in_len;
1760
1761  buf0[0] = replace_on_register (buf0[0], r0, p1);
1762  buf0[1] = replace_on_register (buf0[1], r1, p1);
1763  buf0[2] = replace_on_register (buf0[2], r2, p1);
1764  buf0[3] = replace_on_register (buf0[3], r3, p1);
1765  buf1[0] = replace_on_register (buf1[0], r4, p1);
1766  buf1[1] = replace_on_register (buf1[1], r5, p1);
1767  buf1[2] = replace_on_register (buf1[2], r6, p1);
1768  buf1[3] = replace_on_register (buf1[3], r7, p1);
1769
1770  return in_len;
1771}
1772
1773DECLSPEC u32 rule_op_mangle_purgechar (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1774{
1775  const u32 r0 = search_on_register (buf0[0], p0);
1776  const u32 r1 = search_on_register (buf0[1], p0);
1777  const u32 r2 = search_on_register (buf0[2], p0);
1778  const u32 r3 = search_on_register (buf0[3], p0);
1779  const u32 r4 = search_on_register (buf1[0], p0);
1780  const u32 r5 = search_on_register (buf1[1], p0);
1781  const u32 r6 = search_on_register (buf1[2], p0);
1782  const u32 r7 = search_on_register (buf1[3], p0);
1783
1784  const u32 rn = r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7;
1785
1786  if (rn == 0) return in_len;
1787
1788  u32 out_len = 0;
1789
1790  u32 buf_in[8];
1791
1792  buf_in[0] = buf0[0];
1793  buf_in[1] = buf0[1];
1794  buf_in[2] = buf0[2];
1795  buf_in[3] = buf0[3];
1796  buf_in[4] = buf1[0];
1797  buf_in[5] = buf1[1];
1798  buf_in[6] = buf1[2];
1799  buf_in[7] = buf1[3];
1800
1801  u32 buf_out[8] = { 0 };
1802
1803  u8 *in  = (u8 *) buf_in;
1804  u8 *out = (u8 *) buf_out;
1805
1806  for (u32 pos = 0; pos < in_len; pos++)
1807  {
1808    if (in[pos] == (u8) p0) continue;
1809
1810    out[out_len] = in[pos];
1811
1812    out_len++;
1813  }
1814
1815  buf0[0] = buf_out[0];
1816  buf0[1] = buf_out[1];
1817  buf0[2] = buf_out[2];
1818  buf0[3] = buf_out[3];
1819  buf1[0] = buf_out[4];
1820  buf1[1] = buf_out[5];
1821  buf1[2] = buf_out[6];
1822  buf1[3] = buf_out[7];
1823
1824  return out_len;
1825}
1826
1827DECLSPEC u32 rule_op_mangle_dupechar_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1828{
1829  if ( in_len       ==  0) return in_len;
1830  if ((in_len + p0) >= 32) return in_len;
1831
1832  u32 out_len = in_len;
1833
1834  const u32 tmp = buf0[0] & 0xFF;
1835
1836  const u32 tmp32 = tmp <<  0
1837                  | tmp <<  8
1838                  | tmp << 16
1839                  | tmp << 24;
1840
1841  rshift_block_optimized_N (buf0, buf1, buf0, buf1, p0);
1842
1843  u32 t0[4] = { tmp32, tmp32, tmp32, tmp32 };
1844  u32 t1[4] = { tmp32, tmp32, tmp32, tmp32 };
1845
1846  truncate_right_optimized (t0, t1, p0);
1847
1848  buf0[0] |= t0[0];
1849  buf0[1] |= t0[1];
1850  buf0[2] |= t0[2];
1851  buf0[3] |= t0[3];
1852  buf1[0] |= t1[0];
1853  buf1[1] |= t1[1];
1854  buf1[2] |= t1[2];
1855  buf1[3] |= t1[3];
1856
1857  out_len += p0;
1858
1859  return out_len;
1860}
1861
1862DECLSPEC u32 rule_op_mangle_dupechar_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1863{
1864  if ( in_len       ==  0) return in_len;
1865  if ((in_len + p0) >= 32) return in_len;
1866
1867  const u32 in_len1 = in_len - 1;
1868
1869  const u32 sh = (in_len1 & 3) * 8;
1870
1871  u32 tmp = 0;
1872
1873  u32 v[4];
1874
1875  set_mark_1x4_S (v, in_len1);
1876
1877  switch (in_len1 / 16)
1878  {
1879    case 0:
1880      tmp |= buf0[0] & v[0];
1881      tmp |= buf0[1] & v[1];
1882      tmp |= buf0[2] & v[2];
1883      tmp |= buf0[3] & v[3];
1884      break;
1885
1886    case 1:
1887      tmp |= buf1[0] & v[0];
1888      tmp |= buf1[1] & v[1];
1889      tmp |= buf1[2] & v[2];
1890      tmp |= buf1[3] & v[3];
1891      break;
1892  }
1893
1894  tmp = (tmp >> sh) & 0xff;
1895
1896  u32 out_len = in_len;
1897
1898  for (u32 i = 0; i < p0; i++)
1899  {
1900    append_block1_optimized (out_len, buf0, buf1, tmp);
1901
1902    out_len++;
1903  }
1904
1905  return out_len;
1906}
1907
1908DECLSPEC u32 rule_op_mangle_dupechar_all (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1909{
1910  if ( in_len           ==  0) return in_len;
1911  if ((in_len + in_len) >= 32) return in_len;
1912
1913  u32 out_len = in_len;
1914
1915  u32 tib40[4];
1916  u32 tib41[4];
1917
1918  tib40[0] = ((buf0[0] & 0x000000FF) <<  0) | ((buf0[0] & 0x0000FF00) <<  8);
1919  tib40[1] = ((buf0[0] & 0x00FF0000) >> 16) | ((buf0[0] & 0xFF000000) >>  8);
1920  tib40[2] = ((buf0[1] & 0x000000FF) <<  0) | ((buf0[1] & 0x0000FF00) <<  8);
1921  tib40[3] = ((buf0[1] & 0x00FF0000) >> 16) | ((buf0[1] & 0xFF000000) >>  8);
1922  tib41[0] = ((buf0[2] & 0x000000FF) <<  0) | ((buf0[2] & 0x0000FF00) <<  8);
1923  tib41[1] = ((buf0[2] & 0x00FF0000) >> 16) | ((buf0[2] & 0xFF000000) >>  8);
1924  tib41[2] = ((buf0[3] & 0x000000FF) <<  0) | ((buf0[3] & 0x0000FF00) <<  8);
1925  tib41[3] = ((buf0[3] & 0x00FF0000) >> 16) | ((buf0[3] & 0xFF000000) >>  8);
1926
1927  buf0[0] = tib40[0] | (tib40[0] <<  8);
1928  buf0[1] = tib40[1] | (tib40[1] <<  8);
1929  buf0[2] = tib40[2] | (tib40[2] <<  8);
1930  buf0[3] = tib40[3] | (tib40[3] <<  8);
1931  buf1[0] = tib41[0] | (tib41[0] <<  8);
1932  buf1[1] = tib41[1] | (tib41[1] <<  8);
1933  buf1[2] = tib41[2] | (tib41[2] <<  8);
1934  buf1[3] = tib41[3] | (tib41[3] <<  8);
1935
1936  out_len = out_len + out_len;
1937
1938  return out_len;
1939}
1940
1941DECLSPEC u32 rule_op_mangle_switch_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1942{
1943  if (in_len < 2) return in_len;
1944
1945  buf0[0] = (buf0[0] & 0xFFFF0000) | ((buf0[0] << 8) & 0x0000FF00) | ((buf0[0] >> 8) & 0x000000FF);
1946
1947  return in_len;
1948}
1949
1950DECLSPEC u32 rule_op_mangle_switch_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1951{
1952  if (in_len < 2) return in_len;
1953
1954  u32 t[8];
1955
1956  t[0] = buf0[0];
1957  t[1] = buf0[1];
1958  t[2] = buf0[2];
1959  t[3] = buf0[3];
1960  t[4] = buf1[0];
1961  t[5] = buf1[1];
1962  t[6] = buf1[2];
1963  t[7] = buf1[3];
1964
1965  exchange_byte_optimized (t, in_len - 2, in_len - 1);
1966
1967  buf0[0] = t[0];
1968  buf0[1] = t[1];
1969  buf0[2] = t[2];
1970  buf0[3] = t[3];
1971  buf1[0] = t[4];
1972  buf1[1] = t[5];
1973  buf1[2] = t[6];
1974  buf1[3] = t[7];
1975
1976  return in_len;
1977}
1978
1979DECLSPEC u32 rule_op_mangle_switch_at (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
1980{
1981  if (p0 >= in_len) return in_len;
1982  if (p1 >= in_len) return in_len;
1983
1984  u32 t[8];
1985
1986  t[0] = buf0[0];
1987  t[1] = buf0[1];
1988  t[2] = buf0[2];
1989  t[3] = buf0[3];
1990  t[4] = buf1[0];
1991  t[5] = buf1[1];
1992  t[6] = buf1[2];
1993  t[7] = buf1[3];
1994
1995  exchange_byte_optimized (t, p0, p1);
1996
1997  buf0[0] = t[0];
1998  buf0[1] = t[1];
1999  buf0[2] = t[2];
2000  buf0[3] = t[3];
2001  buf1[0] = t[4];
2002  buf1[1] = t[5];
2003  buf1[2] = t[6];
2004  buf1[3] = t[7];
2005
2006  return in_len;
2007}
2008
2009DECLSPEC u32 rule_op_mangle_chr_shiftl (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2010{
2011  if (p0 >= in_len) return in_len;
2012
2013  const u32 mr = 0xffu << ((p0 & 3) * 8);
2014  const u32 ml = ~mr;
2015
2016  u32 t[8];
2017
2018  t[0] = buf0[0];
2019  t[1] = buf0[1];
2020  t[2] = buf0[2];
2021  t[3] = buf0[3];
2022  t[4] = buf1[0];
2023  t[5] = buf1[1];
2024  t[6] = buf1[2];
2025  t[7] = buf1[3];
2026
2027  const u32 tmp = t[p0 / 4];
2028
2029  t[p0 / 4] = (tmp & ml) | (((tmp & mr) << 1) & mr);
2030
2031  buf0[0] = t[0];
2032  buf0[1] = t[1];
2033  buf0[2] = t[2];
2034  buf0[3] = t[3];
2035  buf1[0] = t[4];
2036  buf1[1] = t[5];
2037  buf1[2] = t[6];
2038  buf1[3] = t[7];
2039
2040  return in_len;
2041}
2042
2043DECLSPEC u32 rule_op_mangle_chr_shiftr (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2044{
2045  if (p0 >= in_len) return in_len;
2046
2047  const u32 mr = 0xffu << ((p0 & 3) * 8);
2048  const u32 ml = ~mr;
2049
2050  u32 t[8];
2051
2052  t[0] = buf0[0];
2053  t[1] = buf0[1];
2054  t[2] = buf0[2];
2055  t[3] = buf0[3];
2056  t[4] = buf1[0];
2057  t[5] = buf1[1];
2058  t[6] = buf1[2];
2059  t[7] = buf1[3];
2060
2061  const u32 tmp = t[p0 / 4];
2062
2063  t[p0 / 4] = (tmp & ml) | (((tmp & mr) >> 1) & mr);
2064
2065  buf0[0] = t[0];
2066  buf0[1] = t[1];
2067  buf0[2] = t[2];
2068  buf0[3] = t[3];
2069  buf1[0] = t[4];
2070  buf1[1] = t[5];
2071  buf1[2] = t[6];
2072  buf1[3] = t[7];
2073
2074  return in_len;
2075}
2076
2077DECLSPEC u32 rule_op_mangle_chr_incr (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2078{
2079  if (p0 >= in_len) return in_len;
2080
2081  const u32 mr = 0xffu << ((p0 & 3) * 8);
2082  const u32 ml = ~mr;
2083
2084  const u32 n = 0x01010101 & mr;
2085
2086  u32 t[8];
2087
2088  t[0] = buf0[0];
2089  t[1] = buf0[1];
2090  t[2] = buf0[2];
2091  t[3] = buf0[3];
2092  t[4] = buf1[0];
2093  t[5] = buf1[1];
2094  t[6] = buf1[2];
2095  t[7] = buf1[3];
2096
2097  const u32 tmp = t[p0 / 4];
2098
2099  t[p0 / 4] = (tmp & ml) | (((tmp & mr) + n) & mr);
2100
2101  buf0[0] = t[0];
2102  buf0[1] = t[1];
2103  buf0[2] = t[2];
2104  buf0[3] = t[3];
2105  buf1[0] = t[4];
2106  buf1[1] = t[5];
2107  buf1[2] = t[6];
2108  buf1[3] = t[7];
2109
2110  return in_len;
2111}
2112
2113DECLSPEC u32 rule_op_mangle_chr_decr (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2114{
2115  if (p0 >= in_len) return in_len;
2116
2117  const u32 mr = 0xffu << ((p0 & 3) * 8);
2118  const u32 ml = ~mr;
2119
2120  const u32 n = 0x01010101 & mr;
2121
2122  u32 t[8];
2123
2124  t[0] = buf0[0];
2125  t[1] = buf0[1];
2126  t[2] = buf0[2];
2127  t[3] = buf0[3];
2128  t[4] = buf1[0];
2129  t[5] = buf1[1];
2130  t[6] = buf1[2];
2131  t[7] = buf1[3];
2132
2133  const u32 tmp = t[p0 / 4];
2134
2135  t[p0 / 4] = (tmp & ml) | (((tmp & mr) - n) & mr);
2136
2137  buf0[0] = t[0];
2138  buf0[1] = t[1];
2139  buf0[2] = t[2];
2140  buf0[3] = t[3];
2141  buf1[0] = t[4];
2142  buf1[1] = t[5];
2143  buf1[2] = t[6];
2144  buf1[3] = t[7];
2145
2146  return in_len;
2147}
2148
2149DECLSPEC u32 rule_op_mangle_replace_np1 (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2150{
2151  if ((p0 + 1) >= in_len) return in_len;
2152
2153  u32 tib4x[8];
2154
2155  lshift_block_optimized (buf0, buf1, tib4x + 0, tib4x + 4);
2156
2157  const u32 mr = 0xffu << ((p0 & 3) * 8);
2158  const u32 ml = ~mr;
2159
2160  u32 t[8];
2161
2162  t[0] = buf0[0];
2163  t[1] = buf0[1];
2164  t[2] = buf0[2];
2165  t[3] = buf0[3];
2166  t[4] = buf1[0];
2167  t[5] = buf1[1];
2168  t[6] = buf1[2];
2169  t[7] = buf1[3];
2170
2171  const u32 tmp = t[p0 / 4];
2172
2173  const u32 tmp2 = tib4x[p0 / 4];
2174
2175  t[p0 / 4] = (tmp & ml) | (tmp2 & mr);
2176
2177  buf0[0] = t[0];
2178  buf0[1] = t[1];
2179  buf0[2] = t[2];
2180  buf0[3] = t[3];
2181  buf1[0] = t[4];
2182  buf1[1] = t[5];
2183  buf1[2] = t[6];
2184  buf1[3] = t[7];
2185
2186  return in_len;
2187}
2188
2189DECLSPEC u32 rule_op_mangle_replace_nm1 (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2190{
2191  if (p0 == 0) return in_len;
2192
2193  if (p0 >= in_len) return in_len;
2194
2195  u32 tib4x[8];
2196
2197  rshift_block_optimized (buf0, buf1, tib4x + 0, tib4x + 4);
2198
2199  const u32 mr = 0xffu << ((p0 & 3) * 8);
2200  const u32 ml = ~mr;
2201
2202  u32 t[8];
2203
2204  t[0] = buf0[0];
2205  t[1] = buf0[1];
2206  t[2] = buf0[2];
2207  t[3] = buf0[3];
2208  t[4] = buf1[0];
2209  t[5] = buf1[1];
2210  t[6] = buf1[2];
2211  t[7] = buf1[3];
2212
2213  const u32 tmp = t[p0 / 4];
2214
2215  const u32 tmp2 = tib4x[p0 / 4];
2216
2217  t[p0 / 4] = (tmp & ml) | (tmp2 & mr);
2218
2219  buf0[0] = t[0];
2220  buf0[1] = t[1];
2221  buf0[2] = t[2];
2222  buf0[3] = t[3];
2223  buf1[0] = t[4];
2224  buf1[1] = t[5];
2225  buf1[2] = t[6];
2226  buf1[3] = t[7];
2227
2228  return in_len;
2229}
2230
2231DECLSPEC u32 rule_op_mangle_dupeblock_first (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2232{
2233  if (p0 > in_len) return in_len;
2234
2235  if ((in_len + p0) >= 32) return in_len;
2236
2237  u32 out_len = in_len;
2238
2239  u32 tib40[4];
2240  u32 tib41[4];
2241
2242  tib40[0] = buf0[0];
2243  tib40[1] = buf0[1];
2244  tib40[2] = buf0[2];
2245  tib40[3] = buf0[3];
2246  tib41[0] = buf1[0];
2247  tib41[1] = buf1[1];
2248  tib41[2] = buf1[2];
2249  tib41[3] = buf1[3];
2250
2251  truncate_right_optimized (tib40, tib41, p0);
2252
2253  rshift_block_optimized_N (buf0, buf1, buf0, buf1, p0);
2254
2255  buf0[0] |= tib40[0];
2256  buf0[1] |= tib40[1];
2257  buf0[2] |= tib40[2];
2258  buf0[3] |= tib40[3];
2259  buf1[0] |= tib41[0];
2260  buf1[1] |= tib41[1];
2261  buf1[2] |= tib41[2];
2262  buf1[3] |= tib41[3];
2263
2264  out_len += p0;
2265
2266  return out_len;
2267}
2268
2269DECLSPEC u32 rule_op_mangle_dupeblock_last (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2270{
2271  if (p0 > in_len) return in_len;
2272
2273  if ((in_len + p0) >= 32) return in_len;
2274
2275  u32 out_len = in_len;
2276
2277  u32 tib40[4];
2278  u32 tib41[4];
2279
2280  rshift_block_optimized_N (buf0, buf1, tib40, tib41, p0);
2281
2282  truncate_left_optimized (tib40, tib41, out_len);
2283
2284  buf0[0] |= tib40[0];
2285  buf0[1] |= tib40[1];
2286  buf0[2] |= tib40[2];
2287  buf0[3] |= tib40[3];
2288  buf1[0] |= tib41[0];
2289  buf1[1] |= tib41[1];
2290  buf1[2] |= tib41[2];
2291  buf1[3] |= tib41[3];
2292
2293  out_len += p0;
2294
2295  return out_len;
2296}
2297
2298DECLSPEC u32 toggle_on_register (const u32 in, const u32 r)
2299{
2300  u32 out = in;
2301
2302  const u32 cmask = generate_cmask_optimized (out);
2303
2304  if (r & 1) out = out ^ (0x00000020 & cmask);
2305  if (r & 2) out = out ^ (0x00002000 & cmask);
2306  if (r & 4) out = out ^ (0x00200000 & cmask);
2307  if (r & 8) out = out ^ (0x20000000 & cmask);
2308
2309  return out;
2310}
2311
2312DECLSPEC u32 rule_op_mangle_title_sep (MAYBE_UNUSED const u32 p0, MAYBE_UNUSED const u32 p1, MAYBE_UNUSED u32 *buf0, MAYBE_UNUSED u32 *buf1, const u32 in_len)
2313{
2314  if (in_len == 0) return in_len;
2315
2316  u32 r0 = search_on_register (buf0[0], p0);
2317  u32 r1 = search_on_register (buf0[1], p0);
2318  u32 r2 = search_on_register (buf0[2], p0);
2319  u32 r3 = search_on_register (buf0[3], p0);
2320  u32 r4 = search_on_register (buf1[0], p0);
2321  u32 r5 = search_on_register (buf1[1], p0);
2322  u32 r6 = search_on_register (buf1[2], p0);
2323  u32 r7 = search_on_register (buf1[3], p0);
2324
2325  rule_op_mangle_lrest_ufirst (p0, p1, buf0, buf1, in_len);
2326
2327  const u32 rn = r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7;
2328
2329  if (rn == 0) return in_len;
2330
2331  r0 <<= 1;
2332  r1 <<= 1; r1 |= r0 >> 4;
2333  r2 <<= 1; r2 |= r1 >> 4;
2334  r3 <<= 1; r3 |= r2 >> 4;
2335  r4 <<= 1; r4 |= r3 >> 4;
2336  r5 <<= 1; r5 |= r4 >> 4;
2337  r6 <<= 1; r6 |= r5 >> 4;
2338  r7 <<= 1; r7 |= r6 >> 4;
2339
2340  buf0[0] = toggle_on_register (buf0[0], r0);
2341  buf0[1] = toggle_on_register (buf0[1], r1);
2342  buf0[2] = toggle_on_register (buf0[2], r2);
2343  buf0[3] = toggle_on_register (buf0[3], r3);
2344  buf1[0] = toggle_on_register (buf1[0], r4);
2345  buf1[1] = toggle_on_register (buf1[1], r5);
2346  buf1[2] = toggle_on_register (buf1[2], r6);
2347  buf1[3] = toggle_on_register (buf1[3], r7);
2348
2349  return in_len;
2350}
2351
2352DECLSPEC u32 apply_rule_optimized (const u32 name, const u32 p0, const u32 p1, u32 *buf0, u32 *buf1, const u32 in_len)
2353{
2354  u32 out_len = in_len;
2355
2356  switch (name)
2357  {
2358    case RULE_OP_MANGLE_LREST:            out_len = rule_op_mangle_lrest            (p0, p1, buf0, buf1, out_len); break;
2359    case RULE_OP_MANGLE_UREST:            out_len = rule_op_mangle_urest            (p0, p1, buf0, buf1, out_len); break;
2360    case RULE_OP_MANGLE_LREST_UFIRST:     out_len = rule_op_mangle_lrest_ufirst     (p0, p1, buf0, buf1, out_len); break;
2361    case RULE_OP_MANGLE_UREST_LFIRST:     out_len = rule_op_mangle_urest_lfirst     (p0, p1, buf0, buf1, out_len); break;
2362    case RULE_OP_MANGLE_TREST:            out_len = rule_op_mangle_trest            (p0, p1, buf0, buf1, out_len); break;
2363    case RULE_OP_MANGLE_TOGGLE_AT:        out_len = rule_op_mangle_toggle_at        (p0, p1, buf0, buf1, out_len); break;
2364    case RULE_OP_MANGLE_TOGGLE_AT_SEP:    out_len = rule_op_mangle_toggle_at_sep    (p0, p1, buf0, buf1, out_len); break;
2365    case RULE_OP_MANGLE_REVERSE:          out_len = rule_op_mangle_reverse          (p0, p1, buf0, buf1, out_len); break;
2366    case RULE_OP_MANGLE_DUPEWORD:         out_len = rule_op_mangle_dupeword         (p0, p1, buf0, buf1, out_len); break;
2367    case RULE_OP_MANGLE_DUPEWORD_TIMES:   out_len = rule_op_mangle_dupeword_times   (p0, p1, buf0, buf1, out_len); break;
2368    case RULE_OP_MANGLE_REFLECT:          out_len = rule_op_mangle_reflect          (p0, p1, buf0, buf1, out_len); break;
2369    case RULE_OP_MANGLE_APPEND:           out_len = rule_op_mangle_append           (p0, p1, buf0, buf1, out_len); break;
2370    case RULE_OP_MANGLE_PREPEND:          out_len = rule_op_mangle_prepend          (p0, p1, buf0, buf1, out_len); break;
2371    case RULE_OP_MANGLE_ROTATE_LEFT:      out_len = rule_op_mangle_rotate_left      (p0, p1, buf0, buf1, out_len); break;
2372    case RULE_OP_MANGLE_ROTATE_RIGHT:     out_len = rule_op_mangle_rotate_right     (p0, p1, buf0, buf1, out_len); break;
2373    case RULE_OP_MANGLE_DELETE_FIRST:     out_len = rule_op_mangle_delete_first     (p0, p1, buf0, buf1, out_len); break;
2374    case RULE_OP_MANGLE_DELETE_LAST:      out_len = rule_op_mangle_delete_last      (p0, p1, buf0, buf1, out_len); break;
2375    case RULE_OP_MANGLE_DELETE_AT:        out_len = rule_op_mangle_delete_at        (p0, p1, buf0, buf1, out_len); break;
2376    case RULE_OP_MANGLE_EXTRACT:          out_len = rule_op_mangle_extract          (p0, p1, buf0, buf1, out_len); break;
2377    case RULE_OP_MANGLE_OMIT:             out_len = rule_op_mangle_omit             (p0, p1, buf0, buf1, out_len); break;
2378    case RULE_OP_MANGLE_INSERT:           out_len = rule_op_mangle_insert           (p0, p1, buf0, buf1, out_len); break;
2379    case RULE_OP_MANGLE_OVERSTRIKE:       out_len = rule_op_mangle_overstrike       (p0, p1, buf0, buf1, out_len); break;
2380    case RULE_OP_MANGLE_TRUNCATE_AT:      out_len = rule_op_mangle_truncate_at      (p0, p1, buf0, buf1, out_len); break;
2381    case RULE_OP_MANGLE_REPLACE:          out_len = rule_op_mangle_replace          (p0, p1, buf0, buf1, out_len); break;
2382    case RULE_OP_MANGLE_PURGECHAR:        out_len = rule_op_mangle_purgechar        (p0, p1, buf0, buf1, out_len); break;
2383    //case RULE_OP_MANGLE_TOGGLECASE_REC:   out_len = rule_op_mangle_togglecase_rec   (p0, p1, buf0, buf1, out_len); break;
2384    case RULE_OP_MANGLE_DUPECHAR_FIRST:   out_len = rule_op_mangle_dupechar_first   (p0, p1, buf0, buf1, out_len); break;
2385    case RULE_OP_MANGLE_DUPECHAR_LAST:    out_len = rule_op_mangle_dupechar_last    (p0, p1, buf0, buf1, out_len); break;
2386    case RULE_OP_MANGLE_DUPECHAR_ALL:     out_len = rule_op_mangle_dupechar_all     (p0, p1, buf0, buf1, out_len); break;
2387    case RULE_OP_MANGLE_SWITCH_FIRST:     out_len = rule_op_mangle_switch_first     (p0, p1, buf0, buf1, out_len); break;
2388    case RULE_OP_MANGLE_SWITCH_LAST:      out_len = rule_op_mangle_switch_last      (p0, p1, buf0, buf1, out_len); break;
2389    case RULE_OP_MANGLE_SWITCH_AT:        out_len = rule_op_mangle_switch_at        (p0, p1, buf0, buf1, out_len); break;
2390    case RULE_OP_MANGLE_CHR_SHIFTL:       out_len = rule_op_mangle_chr_shiftl       (p0, p1, buf0, buf1, out_len); break;
2391    case RULE_OP_MANGLE_CHR_SHIFTR:       out_len = rule_op_mangle_chr_shiftr       (p0, p1, buf0, buf1, out_len); break;
2392    case RULE_OP_MANGLE_CHR_INCR:         out_len = rule_op_mangle_chr_incr         (p0, p1, buf0, buf1, out_len); break;
2393    case RULE_OP_MANGLE_CHR_DECR:         out_len = rule_op_mangle_chr_decr         (p0, p1, buf0, buf1, out_len); break;
2394    case RULE_OP_MANGLE_REPLACE_NP1:      out_len = rule_op_mangle_replace_np1      (p0, p1, buf0, buf1, out_len); break;
2395    case RULE_OP_MANGLE_REPLACE_NM1:      out_len = rule_op_mangle_replace_nm1      (p0, p1, buf0, buf1, out_len); break;
2396    case RULE_OP_MANGLE_DUPEBLOCK_FIRST:  out_len = rule_op_mangle_dupeblock_first  (p0, p1, buf0, buf1, out_len); break;
2397    case RULE_OP_MANGLE_DUPEBLOCK_LAST:   out_len = rule_op_mangle_dupeblock_last   (p0, p1, buf0, buf1, out_len); break;
2398    case RULE_OP_MANGLE_TITLE_SEP:        out_len = rule_op_mangle_title_sep        (p0, p1, buf0, buf1, out_len); break;
2399    case RULE_OP_MANGLE_TITLE:            out_len = rule_op_mangle_title_sep        (' ', p1, buf0, buf1, out_len); break;
2400  }
2401
2402  return out_len;
2403}
2404
2405DECLSPEC u32 apply_rules_optimized (CONSTANT_AS const u32 *cmds, u32 *buf0, u32 *buf1, const u32 len)
2406{
2407  u32 out_len = len;
2408
2409  for (u32 i = 0; cmds[i] != 0; i++)
2410  {
2411    const u32 cmd = cmds[i];
2412
2413    const u32 name = (cmd >>  0) & 0xff;
2414    const u32 p0   = (cmd >>  8) & 0xff;
2415    const u32 p1   = (cmd >> 16) & 0xff;
2416
2417    // we need to guarantee input length < 32 otherwise functions like rule_op_mangle_switch_last() and others will read out of boundary
2418    out_len = apply_rule_optimized (name, p0, p1, buf0, buf1, out_len);
2419  }
2420
2421  return out_len;
2422}
2423
2424DECLSPEC u32x apply_rules_vect_optimized (const u32 *pw_buf0, const u32 *pw_buf1, const u32 pw_len, CONSTANT_AS const kernel_rule_t *kernel_rules, const u32 il_pos, u32x *buf0, u32x *buf1)
2425{
2426  #if VECT_SIZE == 1
2427
2428  buf0[0] = pw_buf0[0];
2429  buf0[1] = pw_buf0[1];
2430  buf0[2] = pw_buf0[2];
2431  buf0[3] = pw_buf0[3];
2432  buf1[0] = pw_buf1[0];
2433  buf1[1] = pw_buf1[1];
2434  buf1[2] = pw_buf1[2];
2435  buf1[3] = pw_buf1[3];
2436
2437  return apply_rules_optimized (kernel_rules[il_pos].cmds, buf0, buf1, pw_len);
2438
2439  #else
2440
2441  u32x out_len = 0;
2442
2443  #ifdef _unroll
2444  #pragma unroll
2445  #endif
2446  for (int i = 0; i < VECT_SIZE; i++)
2447  {
2448    u32 tmp0[4];
2449    u32 tmp1[4];
2450
2451    tmp0[0] = pw_buf0[0];
2452    tmp0[1] = pw_buf0[1];
2453    tmp0[2] = pw_buf0[2];
2454    tmp0[3] = pw_buf0[3];
2455    tmp1[0] = pw_buf1[0];
2456    tmp1[1] = pw_buf1[1];
2457    tmp1[2] = pw_buf1[2];
2458    tmp1[3] = pw_buf1[3];
2459
2460    const u32 tmp_len = apply_rules_optimized (kernel_rules[il_pos + i].cmds, tmp0, tmp1, pw_len);
2461
2462    switch (i)
2463    {
2464      #if VECT_SIZE >= 2
2465      case 0:
2466        buf0[0].s0 = tmp0[0];
2467        buf0[1].s0 = tmp0[1];
2468        buf0[2].s0 = tmp0[2];
2469        buf0[3].s0 = tmp0[3];
2470        buf1[0].s0 = tmp1[0];
2471        buf1[1].s0 = tmp1[1];
2472        buf1[2].s0 = tmp1[2];
2473        buf1[3].s0 = tmp1[3];
2474        out_len.s0 = tmp_len;
2475        break;
2476
2477      case 1:
2478        buf0[0].s1 = tmp0[0];
2479        buf0[1].s1 = tmp0[1];
2480        buf0[2].s1 = tmp0[2];
2481        buf0[3].s1 = tmp0[3];
2482        buf1[0].s1 = tmp1[0];
2483        buf1[1].s1 = tmp1[1];
2484        buf1[2].s1 = tmp1[2];
2485        buf1[3].s1 = tmp1[3];
2486        out_len.s1 = tmp_len;
2487        break;
2488      #endif
2489
2490      #if VECT_SIZE >= 4
2491      case 2:
2492        buf0[0].s2 = tmp0[0];
2493        buf0[1].s2 = tmp0[1];
2494        buf0[2].s2 = tmp0[2];
2495        buf0[3].s2 = tmp0[3];
2496        buf1[0].s2 = tmp1[0];
2497        buf1[1].s2 = tmp1[1];
2498        buf1[2].s2 = tmp1[2];
2499        buf1[3].s2 = tmp1[3];
2500        out_len.s2 = tmp_len;
2501        break;
2502
2503      case 3:
2504        buf0[0].s3 = tmp0[0];
2505        buf0[1].s3 = tmp0[1];
2506        buf0[2].s3 = tmp0[2];
2507        buf0[3].s3 = tmp0[3];
2508        buf1[0].s3 = tmp1[0];
2509        buf1[1].s3 = tmp1[1];
2510        buf1[2].s3 = tmp1[2];
2511        buf1[3].s3 = tmp1[3];
2512        out_len.s3 = tmp_len;
2513        break;
2514      #endif
2515
2516      #if VECT_SIZE >= 8
2517      case 4:
2518        buf0[0].s4 = tmp0[0];
2519        buf0[1].s4 = tmp0[1];
2520        buf0[2].s4 = tmp0[2];
2521        buf0[3].s4 = tmp0[3];
2522        buf1[0].s4 = tmp1[0];
2523        buf1[1].s4 = tmp1[1];
2524        buf1[2].s4 = tmp1[2];
2525        buf1[3].s4 = tmp1[3];
2526        out_len.s4 = tmp_len;
2527        break;
2528
2529      case 5:
2530        buf0[0].s5 = tmp0[0];
2531        buf0[1].s5 = tmp0[1];
2532        buf0[2].s5 = tmp0[2];
2533        buf0[3].s5 = tmp0[3];
2534        buf1[0].s5 = tmp1[0];
2535        buf1[1].s5 = tmp1[1];
2536        buf1[2].s5 = tmp1[2];
2537        buf1[3].s5 = tmp1[3];
2538        out_len.s5 = tmp_len;
2539        break;
2540
2541      case 6:
2542        buf0[0].s6 = tmp0[0];
2543        buf0[1].s6 = tmp0[1];
2544        buf0[2].s6 = tmp0[2];
2545        buf0[3].s6 = tmp0[3];
2546        buf1[0].s6 = tmp1[0];
2547        buf1[1].s6 = tmp1[1];
2548        buf1[2].s6 = tmp1[2];
2549        buf1[3].s6 = tmp1[3];
2550        out_len.s6 = tmp_len;
2551        break;
2552
2553      case 7:
2554        buf0[0].s7 = tmp0[0];
2555        buf0[1].s7 = tmp0[1];
2556        buf0[2].s7 = tmp0[2];
2557        buf0[3].s7 = tmp0[3];
2558        buf1[0].s7 = tmp1[0];
2559        buf1[1].s7 = tmp1[1];
2560        buf1[2].s7 = tmp1[2];
2561        buf1[3].s7 = tmp1[3];
2562        out_len.s7 = tmp_len;
2563        break;
2564      #endif
2565
2566      #if VECT_SIZE >= 16
2567      case 8:
2568        buf0[0].s8 = tmp0[0];
2569        buf0[1].s8 = tmp0[1];
2570        buf0[2].s8 = tmp0[2];
2571        buf0[3].s8 = tmp0[3];
2572        buf1[0].s8 = tmp1[0];
2573        buf1[1].s8 = tmp1[1];
2574        buf1[2].s8 = tmp1[2];
2575        buf1[3].s8 = tmp1[3];
2576        out_len.s8 = tmp_len;
2577        break;
2578
2579      case 9:
2580        buf0[0].s9 = tmp0[0];
2581        buf0[1].s9 = tmp0[1];
2582        buf0[2].s9 = tmp0[2];
2583        buf0[3].s9 = tmp0[3];
2584        buf1[0].s9 = tmp1[0];
2585        buf1[1].s9 = tmp1[1];
2586        buf1[2].s9 = tmp1[2];
2587        buf1[3].s9 = tmp1[3];
2588        out_len.s9 = tmp_len;
2589        break;
2590
2591      case 10:
2592        buf0[0].sa = tmp0[0];
2593        buf0[1].sa = tmp0[1];
2594        buf0[2].sa = tmp0[2];
2595        buf0[3].sa = tmp0[3];
2596        buf1[0].sa = tmp1[0];
2597        buf1[1].sa = tmp1[1];
2598        buf1[2].sa = tmp1[2];
2599        buf1[3].sa = tmp1[3];
2600        out_len.sa = tmp_len;
2601        break;
2602
2603      case 11:
2604        buf0[0].sb = tmp0[0];
2605        buf0[1].sb = tmp0[1];
2606        buf0[2].sb = tmp0[2];
2607        buf0[3].sb = tmp0[3];
2608        buf1[0].sb = tmp1[0];
2609        buf1[1].sb = tmp1[1];
2610        buf1[2].sb = tmp1[2];
2611        buf1[3].sb = tmp1[3];
2612        out_len.sb = tmp_len;
2613        break;
2614
2615      case 12:
2616        buf0[0].sc = tmp0[0];
2617        buf0[1].sc = tmp0[1];
2618        buf0[2].sc = tmp0[2];
2619        buf0[3].sc = tmp0[3];
2620        buf1[0].sc = tmp1[0];
2621        buf1[1].sc = tmp1[1];
2622        buf1[2].sc = tmp1[2];
2623        buf1[3].sc = tmp1[3];
2624        out_len.sc = tmp_len;
2625        break;
2626
2627      case 13:
2628        buf0[0].sd = tmp0[0];
2629        buf0[1].sd = tmp0[1];
2630        buf0[2].sd = tmp0[2];
2631        buf0[3].sd = tmp0[3];
2632        buf1[0].sd = tmp1[0];
2633        buf1[1].sd = tmp1[1];
2634        buf1[2].sd = tmp1[2];
2635        buf1[3].sd = tmp1[3];
2636        out_len.sd = tmp_len;
2637        break;
2638
2639      case 14:
2640        buf0[0].se = tmp0[0];
2641        buf0[1].se = tmp0[1];
2642        buf0[2].se = tmp0[2];
2643        buf0[3].se = tmp0[3];
2644        buf1[0].se = tmp1[0];
2645        buf1[1].se = tmp1[1];
2646        buf1[2].se = tmp1[2];
2647        buf1[3].se = tmp1[3];
2648        out_len.se = tmp_len;
2649        break;
2650
2651      case 15:
2652        buf0[0].sf = tmp0[0];
2653        buf0[1].sf = tmp0[1];
2654        buf0[2].sf = tmp0[2];
2655        buf0[3].sf = tmp0[3];
2656        buf1[0].sf = tmp1[0];
2657        buf1[1].sf = tmp1[1];
2658        buf1[2].sf = tmp1[2];
2659        buf1[3].sf = tmp1[3];
2660        out_len.sf = tmp_len;
2661        break;
2662      #endif
2663    }
2664  }
2665
2666  return out_len;
2667
2668  #endif
2669}
2670