1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9                     This module by Zoltan Herczeg
10      Original API code Copyright (c) 1997-2012 University of Cambridge
11           New API code Copyright (c) 2016-2019 University of Cambridge
12 
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16 
17     * Redistributions of source code must retain the above copyright notice,
18       this list of conditions and the following disclaimer.
19 
20     * Redistributions in binary form must reproduce the above copyright
21       notice, this list of conditions and the following disclaimer in the
22       documentation and/or other materials provided with the distribution.
23 
    * Neither the name of the University of Cambridge nor the names of its
25       contributors may be used to endorse or promote products derived from
26       this software without specific prior written permission.
27 
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41 
42 #if !(defined SUPPORT_VALGRIND)
43 
44 #if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45      || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X))
46 
/* Selects the comparison sequence that fast_forward_char_pair_sse2_compare()
emits for one vector of subject code units. */
typedef enum {
  /* Plain equality test against the first pattern vector. */
  vector_compare_match1 = 0,
  /* OR the second pattern vector into the data first, then test for equality
  with the first pattern vector. */
  vector_compare_match1i = 1,
  /* Equality test against both pattern vectors, results OR-ed together. */
  vector_compare_match2 = 2
} vector_compare_type;
52 
53 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
54 {
55 #if PCRE2_CODE_UNIT_WIDTH == 8
56 return 15;
57 #elif PCRE2_CODE_UNIT_WIDTH == 16
from_u8(n: u8) -> Option<Block>58 return 7;
59 #elif PCRE2_CODE_UNIT_WIDTH == 32
60 return 3;
61 #else
62 #error "Unsupported unit width"
63 #endif
64 }
65 
66 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
67 static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
68 {
69 #if PCRE2_CODE_UNIT_WIDTH == 8
70 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
71 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
72 #elif PCRE2_CODE_UNIT_WIDTH == 16
73 OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
74 return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
75 #else
76 #error "Unknown code width"
77 #endif
78 }
79 #endif
80 
81 #endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X */
82 
83 #if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
84 
85 static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
86 {
87 sljit_u32 value = chr;
88 #if PCRE2_CODE_UNIT_WIDTH == 8
89 #define SSE2_COMPARE_TYPE_INDEX 0
90 return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
91 #elif PCRE2_CODE_UNIT_WIDTH == 16
92 #define SSE2_COMPARE_TYPE_INDEX 1
93 return (sljit_s32)((value << 16) | value);
94 #elif PCRE2_CODE_UNIT_WIDTH == 32
95 #define SSE2_COMPARE_TYPE_INDEX 2
96 return (sljit_s32)(value);
97 #else
98 #error "Unsupported unit width"
99 #endif
100 }
101 
102 static void load_from_mem_sse2(struct sljit_compiler *compiler, sljit_s32 dst_xmm_reg, sljit_s32 src_general_reg, sljit_s8 offset)
103 {
104 sljit_u8 instruction[5];
105 
106 SLJIT_ASSERT(dst_xmm_reg < 8);
107 SLJIT_ASSERT(src_general_reg < 8);
into_known(self) -> Option<Extension>108 
109 /* MOVDQA xmm1, xmm2/m128 */
110 instruction[0] = ((sljit_u8)offset & 0xf) == 0 ? 0x66 : 0xf3;
111 instruction[1] = 0x0f;
112 instruction[2] = 0x6f;
113 
from(ext: Extension) -> Self114 if (offset == 0)
115   {
116   instruction[3] = (dst_xmm_reg << 3) | src_general_reg;
117   sljit_emit_op_custom(compiler, instruction, 4);
118   return;
119   }
120 
from_u8(n: u8) -> Option<Extension>121 instruction[3] = 0x40 | (dst_xmm_reg << 3) | src_general_reg;
122 instruction[4] = (sljit_u8)offset;
123 sljit_emit_op_custom(compiler, instruction, 5);
124 }
125 
126 static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
127   int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
128 {
129 sljit_u8 instruction[4];
130 instruction[0] = 0x66;
131 instruction[1] = 0x0f;
132 
133 SLJIT_ASSERT(step >= 0 && step <= 3);
134 
135 if (compare_type != vector_compare_match2)
136   {
137   if (step == 0)
138     {
139     if (compare_type == vector_compare_match1i)
140       {
141       /* POR xmm1, xmm2/m128 */
142       /* instruction[0] = 0x66; */
143       /* instruction[1] = 0x0f; */
144       instruction[2] = 0xeb;
145       instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
146       sljit_emit_op_custom(compiler, instruction, 4);
147       }
148     return;
149     }
150 
151   if (step != 2)
152     return;
153 
154   /* PCMPEQB/W/D xmm1, xmm2/m128 */
155   /* instruction[0] = 0x66; */
156   /* instruction[1] = 0x0f; */
157   instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
158   instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
159   sljit_emit_op_custom(compiler, instruction, 4);
160   return;
default() -> Frame<'a>161   }
162 
163 switch (step)
164   {
165   case 0:
166   /* MOVDQA xmm1, xmm2/m128 */
167   /* instruction[0] = 0x66; */
168   /* instruction[1] = 0x0f; */
169   instruction[2] = 0x6f;
170   instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
171   sljit_emit_op_custom(compiler, instruction, 4);
172   return;
173 
174   case 1:
175   /* PCMPEQB/W/D xmm1, xmm2/m128 */
176   /* instruction[0] = 0x66; */
177   /* instruction[1] = 0x0f; */
178   instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
179   instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
180   sljit_emit_op_custom(compiler, instruction, 4);
181   return;
182 
183   case 2:
184   /* PCMPEQB/W/D xmm1, xmm2/m128 */
185   /* instruction[0] = 0x66; */
186   /* instruction[1] = 0x0f; */
187   instruction[2] = 0x74 + SSE2_COMPARE_TYPE_INDEX;
188   instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
189   sljit_emit_op_custom(compiler, instruction, 4);
190   return;
191 
192   case 3:
193   /* POR xmm1, xmm2/m128 */
194   /* instruction[0] = 0x66; */
195   /* instruction[1] = 0x0f; */
196   instruction[2] = 0xeb;
197   instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
198   sljit_emit_op_custom(compiler, instruction, 4);
199   return;
200   }
201 }
202 
203 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
204 
205 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
206 {
207 DEFINE_COMPILER;
from_rgba_speed(width: u16, height: u16, pixels: &mut [u8], speed: i32) -> Frame<'static>208 sljit_u8 instruction[8];
209 struct sljit_label *start;
210 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
211 struct sljit_label *restart;
212 #endif
213 struct sljit_jump *quit;
214 struct sljit_jump *partial_quit[2];
215 vector_compare_type compare_type = vector_compare_match1;
216 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
217 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
218 sljit_s32 data_ind = 0;
219 sljit_s32 tmp_ind = 1;
220 sljit_s32 cmp1_ind = 2;
221 sljit_s32 cmp2_ind = 3;
222 sljit_u32 bit = 0;
223 int i;
224 
225 SLJIT_UNUSED_ARG(offset);
226 
227 if (char1 != char2)
228   {
229   bit = char1 ^ char2;
230   compare_type = vector_compare_match1i;
231 
232   if (!is_powerof2(bit))
233     {
234     bit = 0;
235     compare_type = vector_compare_match2;
236     }
237   }
238 
239 partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
240 if (common->mode == PCRE2_JIT_COMPLETE)
241   add_jump(compiler, &common->failed_match, partial_quit[0]);
242 
243 /* First part (unaligned start) */
244 
245 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
246 
247 SLJIT_ASSERT(tmp1_reg_ind < 8);
248 
249 /* MOVD xmm, r/m32 */
250 instruction[0] = 0x66;
251 instruction[1] = 0x0f;
252 instruction[2] = 0x6e;
253 instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
254 sljit_emit_op_custom(compiler, instruction, 4);
255 
256 if (char1 != char2)
257   {
258   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
259 
260   /* MOVD xmm, r/m32 */
261   instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
262   sljit_emit_op_custom(compiler, instruction, 4);
from_palette_pixels(width: u16, height: u16, pixels: &[u8], palette: &[u8], transparent: Option<u8>) -> Frame<'static>263   }
264 
265 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
266 
267 /* PSHUFD xmm1, xmm2/m128, imm8 */
268 /* instruction[0] = 0x66; */
269 /* instruction[1] = 0x0f; */
270 instruction[2] = 0x70;
271 instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
272 instruction[4] = 0;
273 sljit_emit_op_custom(compiler, instruction, 5);
274 
275 if (char1 != char2)
276   {
277   /* PSHUFD xmm1, xmm2/m128, imm8 */
278   instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
279   sljit_emit_op_custom(compiler, instruction, 5);
280   }
from_indexed_pixels(width: u16, height: u16, pixels: &[u8], transparent: Option<u8>) -> Frame<'static>281 
282 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
283 restart = LABEL();
284 #endif
285 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
286 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
287 
288 load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
289 for (i = 0; i < 4; i++)
290   fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
291 
292 /* PMOVMSKB reg, xmm */
293 /* instruction[0] = 0x66; */
294 /* instruction[1] = 0x0f; */
295 instruction[2] = 0xd7;
296 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
297 sljit_emit_op_custom(compiler, instruction, 4);
298 
299 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
300 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
301 
302 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
303 
from_rgb(width: u16, height: u16, pixels: &[u8]) -> Frame<'static>304 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
305 
306 /* Second part (aligned) */
307 start = LABEL();
308 
309 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
310 
311 partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
312 if (common->mode == PCRE2_JIT_COMPLETE)
313   add_jump(compiler, &common->failed_match, partial_quit[1]);
314 
315 load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
316 for (i = 0; i < 4; i++)
317   fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
318 
319 /* PMOVMSKB reg, xmm */
320 /* instruction[0] = 0x66; */
321 /* instruction[1] = 0x0f; */
from_rgb_speed(width: u16, height: u16, pixels: &[u8], speed: i32) -> Frame<'static>322 instruction[2] = 0xd7;
323 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
324 sljit_emit_op_custom(compiler, instruction, 4);
325 
326 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
327 
328 JUMPHERE(quit);
329 
330 /* BSF r32, r/m32 */
required_bytes(&self) -> usize331 instruction[0] = 0x0f;
332 instruction[1] = 0xbc;
333 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
334 sljit_emit_op_custom(compiler, instruction, 3);
335 
336 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
337 
338 if (common->mode != PCRE2_JIT_COMPLETE)
339   {
rgba_speed_avoid_panic_256_colors()340   JUMPHERE(partial_quit[0]);
341   JUMPHERE(partial_quit[1]);
342   OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
343   CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
344   }
345 else
346   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
347 
348 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
349 if (common->utf && offset > 0)
350   {
351   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
352 
353   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
354 
355   quit = jump_if_utf_char_start(compiler, TMP1);
356 
357   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
358   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
359   OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
360   JUMPTO(SLJIT_JUMP, restart);
361 
362   JUMPHERE(quit);
363   }
364 #endif
365 }
366 
367 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
368 
369 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
370 {
371 DEFINE_COMPILER;
372 sljit_u8 instruction[8];
373 struct sljit_label *start;
374 struct sljit_jump *quit;
375 jump_list *not_found = NULL;
376 vector_compare_type compare_type = vector_compare_match1;
377 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
378 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
379 sljit_s32 data_ind = 0;
380 sljit_s32 tmp_ind = 1;
381 sljit_s32 cmp1_ind = 2;
382 sljit_s32 cmp2_ind = 3;
383 sljit_u32 bit = 0;
384 int i;
385 
386 if (char1 != char2)
387   {
388   bit = char1 ^ char2;
389   compare_type = vector_compare_match1i;
390 
391   if (!is_powerof2(bit))
392     {
393     bit = 0;
394     compare_type = vector_compare_match2;
395     }
396   }
397 
398 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
399 OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
400 OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
401 
402 /* First part (unaligned start) */
403 
404 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
405 
406 SLJIT_ASSERT(tmp1_reg_ind < 8);
407 
408 /* MOVD xmm, r/m32 */
409 instruction[0] = 0x66;
410 instruction[1] = 0x0f;
411 instruction[2] = 0x6e;
412 instruction[3] = 0xc0 | (cmp1_ind << 3) | tmp1_reg_ind;
413 sljit_emit_op_custom(compiler, instruction, 4);
414 
415 if (char1 != char2)
416   {
417   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
418 
419   /* MOVD xmm, r/m32 */
420   instruction[3] = 0xc0 | (cmp2_ind << 3) | tmp1_reg_ind;
421   sljit_emit_op_custom(compiler, instruction, 4);
422   }
423 
424 OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
425 
426 /* PSHUFD xmm1, xmm2/m128, imm8 */
427 /* instruction[0] = 0x66; */
428 /* instruction[1] = 0x0f; */
429 instruction[2] = 0x70;
430 instruction[3] = 0xc0 | (cmp1_ind << 3) | cmp1_ind;
431 instruction[4] = 0;
432 sljit_emit_op_custom(compiler, instruction, 5);
433 
434 if (char1 != char2)
435   {
436   /* PSHUFD xmm1, xmm2/m128, imm8 */
437   instruction[3] = 0xc0 | (cmp2_ind << 3) | cmp2_ind;
438   sljit_emit_op_custom(compiler, instruction, 5);
439   }
440 
441 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
442 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
443 
444 load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
445 for (i = 0; i < 4; i++)
446   fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
447 
448 /* PMOVMSKB reg, xmm */
449 /* instruction[0] = 0x66; */
450 /* instruction[1] = 0x0f; */
451 instruction[2] = 0xd7;
452 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
453 sljit_emit_op_custom(compiler, instruction, 4);
454 
455 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
456 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
457 
458 quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
459 
460 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
461 
462 /* Second part (aligned) */
463 start = LABEL();
464 
465 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
466 
467 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
468 
469 load_from_mem_sse2(compiler, data_ind, str_ptr_reg_ind, 0);
470 for (i = 0; i < 4; i++)
471   fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
472 
473 /* PMOVMSKB reg, xmm */
474 /* instruction[0] = 0x66; */
475 /* instruction[1] = 0x0f; */
476 instruction[2] = 0xd7;
477 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | data_ind;
478 sljit_emit_op_custom(compiler, instruction, 4);
479 
480 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
481 
482 JUMPHERE(quit);
483 
484 /* BSF r32, r/m32 */
485 instruction[0] = 0x0f;
486 instruction[1] = 0xbc;
487 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
488 sljit_emit_op_custom(compiler, instruction, 3);
489 
490 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
491 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
492 
493 OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
494 return not_found;
495 }
496 
497 #ifndef _WIN64
498 
499 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SSE2))
500 
501 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
502   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
503 {
504 DEFINE_COMPILER;
505 sljit_u8 instruction[8];
506 vector_compare_type compare1_type = vector_compare_match1;
507 vector_compare_type compare2_type = vector_compare_match1;
508 sljit_u32 bit1 = 0;
509 sljit_u32 bit2 = 0;
510 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
511 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
512 sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
513 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
514 sljit_s32 data1_ind = 0;
515 sljit_s32 data2_ind = 1;
516 sljit_s32 tmp1_ind = 2;
517 sljit_s32 tmp2_ind = 3;
518 sljit_s32 cmp1a_ind = 4;
519 sljit_s32 cmp1b_ind = 5;
520 sljit_s32 cmp2a_ind = 6;
521 sljit_s32 cmp2b_ind = 7;
522 struct sljit_label *start;
523 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
524 struct sljit_label *restart;
525 #endif
526 struct sljit_jump *jump[2];
527 int i;
528 
529 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
530 SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
531 SLJIT_ASSERT(tmp1_reg_ind < 8 && tmp2_reg_ind == 1);
532 
533 /* Initialize. */
534 if (common->match_end_ptr != 0)
535   {
536   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
537   OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
538   OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
539 
540   OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
541   CMOV(SLJIT_LESS, STR_END, TMP1, 0);
542   }
543 
544 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
545 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
546 
547 /* MOVD xmm, r/m32 */
548 instruction[0] = 0x66;
549 instruction[1] = 0x0f;
550 instruction[2] = 0x6e;
551 
552 if (char1a == char1b)
553   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
554 else
555   {
556   bit1 = char1a ^ char1b;
557   if (is_powerof2(bit1))
558     {
559     compare1_type = vector_compare_match1i;
560     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
561     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
562     }
563   else
564     {
565     compare1_type = vector_compare_match2;
566     bit1 = 0;
567     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
568     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
569     }
570   }
571 
572 instruction[3] = 0xc0 | (cmp1a_ind << 3) | tmp1_reg_ind;
573 sljit_emit_op_custom(compiler, instruction, 4);
574 
575 if (char1a != char1b)
576   {
577   instruction[3] = 0xc0 | (cmp1b_ind << 3) | tmp2_reg_ind;
578   sljit_emit_op_custom(compiler, instruction, 4);
579   }
580 
581 if (char2a == char2b)
582   OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
583 else
584   {
585   bit2 = char2a ^ char2b;
586   if (is_powerof2(bit2))
587     {
588     compare2_type = vector_compare_match1i;
589     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
590     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
591     }
592   else
593     {
594     compare2_type = vector_compare_match2;
595     bit2 = 0;
596     OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
597     OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
598     }
599   }
600 
601 instruction[3] = 0xc0 | (cmp2a_ind << 3) | tmp1_reg_ind;
602 sljit_emit_op_custom(compiler, instruction, 4);
603 
604 if (char2a != char2b)
605   {
606   instruction[3] = 0xc0 | (cmp2b_ind << 3) | tmp2_reg_ind;
607   sljit_emit_op_custom(compiler, instruction, 4);
608   }
609 
610 /* PSHUFD xmm1, xmm2/m128, imm8 */
611 /* instruction[0] = 0x66; */
612 /* instruction[1] = 0x0f; */
613 instruction[2] = 0x70;
614 instruction[4] = 0;
615 
616 instruction[3] = 0xc0 | (cmp1a_ind << 3) | cmp1a_ind;
617 sljit_emit_op_custom(compiler, instruction, 5);
618 
619 if (char1a != char1b)
620   {
621   instruction[3] = 0xc0 | (cmp1b_ind << 3) | cmp1b_ind;
622   sljit_emit_op_custom(compiler, instruction, 5);
623   }
624 
625 instruction[3] = 0xc0 | (cmp2a_ind << 3) | cmp2a_ind;
626 sljit_emit_op_custom(compiler, instruction, 5);
627 
628 if (char2a != char2b)
629   {
630   instruction[3] = 0xc0 | (cmp2b_ind << 3) | cmp2b_ind;
631   sljit_emit_op_custom(compiler, instruction, 5);
632   }
633 
634 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
635 restart = LABEL();
636 #endif
637 
638 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
639 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
640 OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~0xf);
641 
642 load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
643 
644 jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
645 
646 load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
647 jump[1] = JUMP(SLJIT_JUMP);
648 
649 JUMPHERE(jump[0]);
650 
651 /* MOVDQA xmm1, xmm2/m128 */
652 /* instruction[0] = 0x66; */
653 /* instruction[1] = 0x0f; */
654 instruction[2] = 0x6f;
655 instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
656 sljit_emit_op_custom(compiler, instruction, 4);
657 
658 /* PSLLDQ xmm1, imm8 */
659 /* instruction[0] = 0x66; */
660 /* instruction[1] = 0x0f; */
661 instruction[2] = 0x73;
662 instruction[3] = 0xc0 | (7 << 3) | data2_ind;
663 instruction[4] = diff;
664 sljit_emit_op_custom(compiler, instruction, 5);
665 
666 JUMPHERE(jump[1]);
667 
668 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
669 
670 for (i = 0; i < 4; i++)
671   {
672   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
673   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
674   }
675 
676 /* PAND xmm1, xmm2/m128 */
677 /* instruction[0] = 0x66; */
678 /* instruction[1] = 0x0f; */
679 instruction[2] = 0xdb;
680 instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
681 sljit_emit_op_custom(compiler, instruction, 4);
682 
683 /* PMOVMSKB reg, xmm */
684 /* instruction[0] = 0x66; */
685 /* instruction[1] = 0x0f; */
686 instruction[2] = 0xd7;
687 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
688 sljit_emit_op_custom(compiler, instruction, 4);
689 
690 /* Ignore matches before the first STR_PTR. */
691 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
692 OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
693 
694 jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
695 
696 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
697 
698 /* Main loop. */
699 start = LABEL();
700 
701 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
702 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
703 
704 load_from_mem_sse2(compiler, data1_ind, str_ptr_reg_ind, 0);
705 load_from_mem_sse2(compiler, data2_ind, str_ptr_reg_ind, -(sljit_s8)diff);
706 
707 for (i = 0; i < 4; i++)
708   {
709   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
710   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
711   }
712 
713 /* PAND xmm1, xmm2/m128 */
714 /* instruction[0] = 0x66; */
715 /* instruction[1] = 0x0f; */
716 instruction[2] = 0xdb;
717 instruction[3] = 0xc0 | (data1_ind << 3) | data2_ind;
718 sljit_emit_op_custom(compiler, instruction, 4);
719 
720 /* PMOVMSKB reg, xmm */
721 /* instruction[0] = 0x66; */
722 /* instruction[1] = 0x0f; */
723 instruction[2] = 0xd7;
724 instruction[3] = 0xc0 | (tmp1_reg_ind << 3) | 0;
725 sljit_emit_op_custom(compiler, instruction, 4);
726 
727 CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
728 
729 JUMPHERE(jump[0]);
730 
731 /* BSF r32, r/m32 */
732 instruction[0] = 0x0f;
733 instruction[1] = 0xbc;
734 instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
735 sljit_emit_op_custom(compiler, instruction, 3);
736 
737 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
738 
739 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
740 
741 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
742 if (common->utf)
743   {
744   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
745 
746   jump[0] = jump_if_utf_char_start(compiler, TMP1);
747 
748   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
749   CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
750 
751   add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
752 
753   JUMPHERE(jump[0]);
754   }
755 #endif
756 
757 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
758 
759 if (common->match_end_ptr != 0)
760   OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
761 }
762 
763 #endif /* !_WIN64 */
764 
765 #undef SSE2_COMPARE_TYPE_INDEX
766 
767 #endif /* SLJIT_CONFIG_X86 */
768 
769 #if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
770 
771 #include <arm_neon.h>
772 
/* Views one unsigned int either as a whole (x) or as four separate
unsigned char fields (c.c1 .. c.c4). */
typedef union {
  unsigned int x;
  struct {
    unsigned char c1;
    unsigned char c2;
    unsigned char c3;
    unsigned char c4;
  } c;
} int_char;
777 
778 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
779 static SLJIT_INLINE int utf_continue(sljit_u8 *s)
780 {
781 #if PCRE2_CODE_UNIT_WIDTH == 8
782 return (*s & 0xc0) == 0x80;
783 #elif PCRE2_CODE_UNIT_WIDTH == 16
784 return (*s & 0xfc00) == 0xdc00;
785 #else
786 #error "Unknown code width"
787 #endif
788 }
789 #endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
790 
/* Code unit width specific NEON configuration.

  VECTOR_FACTOR  number of code units in one 128 bit vector
  vect_t         the NEON vector type holding that many code units
  VLD1Q .. VANDQ aliases for the NEON intrinsics of the matching element type
                 (VLD1Q also casts its argument to the element pointer type)
  quad_word      16 byte union: the code units (mem) or two 64 bit words (dw)

The last branch is a plain #else, so it is used for 32 bit code units and for
any width that is neither 8 nor 16.
NOTE(review): these names are presumably consumed by pcre2_jit_neon_inc.h,
which is included below - confirm there before renaming anything. */
#if PCRE2_CODE_UNIT_WIDTH == 8
# define VECTOR_FACTOR 16
# define vect_t uint8x16_t
# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
# define VCEQQ vceqq_u8
# define VORRQ vorrq_u8
# define VST1Q vst1q_u8
# define VDUPQ vdupq_n_u8
# define VEXTQ vextq_u8
# define VANDQ vandq_u8
typedef union {
       uint8_t mem[16];
       uint64_t dw[2];
} quad_word;
#elif PCRE2_CODE_UNIT_WIDTH == 16
# define VECTOR_FACTOR 8
# define vect_t uint16x8_t
# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
# define VCEQQ vceqq_u16
# define VORRQ vorrq_u16
# define VST1Q vst1q_u16
# define VDUPQ vdupq_n_u16
# define VEXTQ vextq_u16
# define VANDQ vandq_u16
typedef union {
       uint16_t mem[8];
       uint64_t dw[2];
} quad_word;
#else
# define VECTOR_FACTOR 4
# define vect_t uint32x4_t
# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
# define VCEQQ vceqq_u32
# define VORRQ vorrq_u32
# define VST1Q vst1q_u32
# define VDUPQ vdupq_n_u32
# define VEXTQ vextq_u32
# define VANDQ vandq_u32
typedef union {
       uint32_t mem[4];
       uint64_t dw[2];
} quad_word;
#endif
834 
/* pcre2_jit_neon_inc.h is included once per selector macro (FFCS, FFCS_2,
FFCS_MASK), and a second time with FF_UTF defined when Unicode support is
enabled and code units are narrower than 32 bits.
NOTE(review): each inclusion presumably defines one of the search helpers
called by fast_forward_char_simd() below (ffcs, ffcs_2, ffcs_mask and their
_utf variants) - confirm in pcre2_jit_neon_inc.h. */

/* Selector FFCS. */
#define FFCS
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS

/* Selector FFCS_2. */
#define FFCS_2
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS_2

/* Selector FFCS_MASK. */
#define FFCS_MASK
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCS_MASK
861 
862 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
863 
864 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
865 {
866 DEFINE_COMPILER;
867 int_char ic;
868 struct sljit_jump *partial_quit;
869 /* Save temporary registers. */
870 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
871 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);
872 
873 /* Prepare function arguments */
874 OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
875 OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
876 OP1(SLJIT_MOV, SLJIT_R2, 0, SLJIT_IMM, offset);
877 
878 if (char1 == char2)
879   {
880     ic.c.c1 = char1;
881     ic.c.c2 = char2;
882     OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
883 
884 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
885   if (common->utf && offset > 0)
886     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
887                      SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_utf));
888   else
889     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
890                      SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
891 #else
892   sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
893                    SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs));
894 #endif
895   }
896 else
897   {
898   PCRE2_UCHAR mask = char1 ^ char2;
899   if (is_powerof2(mask))
900     {
901     ic.c.c1 = char1 | mask;
902     ic.c.c2 = mask;
903     OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
904 
905 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
906     if (common->utf && offset > 0)
907       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
908                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask_utf));
909     else
910       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
911                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
912 #else
913     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
914                      SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_mask));
915 #endif
916     }
917   else
918     {
919       ic.c.c1 = char1;
920       ic.c.c2 = char2;
921       OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
922 
923 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
924     if (common->utf && offset > 0)
925       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
926                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2_utf));
927     else
928       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
929                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
930 #else
931     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(UW) | SLJIT_ARG3(UW) | SLJIT_ARG4(UW),
932                      SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcs_2));
933 #endif
934     }
935   }
936 /* Restore registers. */
937 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
938 OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
939 
940 /* Check return value. */
941 partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
942 if (common->mode == PCRE2_JIT_COMPLETE)
943   add_jump(compiler, &common->failed_match, partial_quit);
944 
945 /* Fast forward STR_PTR to the result of memchr. */
946 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
947 
948 if (common->mode != PCRE2_JIT_COMPLETE)
949   JUMPHERE(partial_quit);
950 }
951 
/* Selects how fast_forward_char_pair_compare() tests the lanes of a vector. */
typedef enum {
  /* A lane matches when it equals the first comparison value. */
  compare_match1 = 0,
  /* A lane matches when it equals the first comparison value after the
     second value has been ORed into the lane. */
  compare_match1i = 1,
  /* A lane matches when it equals either of the two comparison values. */
  compare_match2 = 2
} compare_type;
957 
958 static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
959 {
960 if (ctype == compare_match2)
961   {
962   vect_t tmp = dst;
963   dst = VCEQQ(dst, cmp1);
964   tmp = VCEQQ(tmp, cmp2);
965   dst = VORRQ(dst, tmp);
966   return dst;
967   }
968 
969 if (ctype == compare_match1i)
970   dst = VORRQ(dst, cmp2);
971 dst = VCEQQ(dst, cmp1);
972 return dst;
973 }
974 
975 static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
976 {
977 #if PCRE2_CODE_UNIT_WIDTH == 8
978 return 15;
979 #elif PCRE2_CODE_UNIT_WIDTH == 16
980 return 7;
981 #elif PCRE2_CODE_UNIT_WIDTH == 32
982 return 3;
983 #else
984 #error "Unsupported unit width"
985 #endif
986 }
987 
988 /* ARM doesn't have a shift left across lanes. */
989 static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
990 {
991 vect_t zero = VDUPQ(0);
992 SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
993 /* VEXTQ takes an immediate as last argument. */
994 #define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
995 switch (n)
996   {
997   C(1); C(2); C(3);
998 #if PCRE2_CODE_UNIT_WIDTH != 32
999   C(4); C(5); C(6); C(7);
1000 # if PCRE2_CODE_UNIT_WIDTH != 16
1001   C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
1002 # endif
1003 #endif
1004   default:
1005     /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
1006        happen. The return is still here for compilers to not warn. */
1007     return a;
1008   }
1009 }
1010 
/* Instantiate the character pair search helpers. The same template header is
   included once per configuration, and the macros defined before each
   inclusion select the variant. fast_forward_char_pair_simd() below calls
   ffcps_0, ffcps_1 and ffcps_default (and their *_utf forms), which are
   presumably the functions produced by these inclusions - confirm against
   pcre2_jit_neon_inc.h. */
#define FFCPS
#define FFCPS_DIFF1
#define FFCPS_CHAR1A2A

/* Variant selected with FFCPS_DIFF1 and FFCPS_CHAR1A2A both defined. */
#define FFCPS_0
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Second inclusion of the same configuration with FF_UTF defined. */
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCPS_0

#undef FFCPS_CHAR1A2A

/* Variant selected with FFCPS_DIFF1 defined but FFCPS_CHAR1A2A undefined. */
#define FFCPS_1
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCPS_1

#undef FFCPS_DIFF1

/* Variant selected with neither FFCPS_DIFF1 nor FFCPS_CHAR1A2A defined.
   NOTE(review): FFCPS_DEFAULT is not undefined afterwards, unlike FFCPS_0
   and FFCPS_1 - confirm that this is intentional. */
#define FFCPS_DEFAULT
#include "pcre2_jit_neon_inc.h"
#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
# define FF_UTF
# include "pcre2_jit_neon_inc.h"
# undef FF_UTF
#endif
#undef FFCPS
1045 
1046 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1047 
1048 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1049   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1050 {
1051 DEFINE_COMPILER;
1052 sljit_u32 diff = IN_UCHARS(offs1 - offs2);
1053 struct sljit_jump *partial_quit;
1054 int_char ic;
1055 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1056 SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
1057 SLJIT_ASSERT(compiler->scratches == 5);
1058 
1059 /* Save temporary register STR_PTR. */
1060 OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
1061 
1062 /* Prepare arguments for the function call. */
1063 if (common->match_end_ptr == 0)
1064    OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
1065 else
1066   {
1067   OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1068   OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1069 
1070   OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, STR_END, 0, SLJIT_R0, 0);
1071   CMOV(SLJIT_LESS, SLJIT_R0, STR_END, 0);
1072   }
1073 
1074 OP1(SLJIT_MOV, SLJIT_R1, 0, STR_PTR, 0);
1075 OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
1076 OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
1077 ic.c.c1 = char1a;
1078 ic.c.c2 = char1b;
1079 ic.c.c3 = char2a;
1080 ic.c.c4 = char2b;
1081 OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);
1082 
1083 if (diff == 1) {
1084   if (char1a == char1b && char2a == char2b) {
1085 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1086     if (common->utf)
1087       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
1088                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0_utf));
1089     else
1090 #endif
1091       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
1092                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_0));
1093   } else {
1094 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1095     if (common->utf)
1096       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
1097                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1_utf));
1098     else
1099 #endif
1100       sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
1101                        SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_1));
1102   }
1103 } else {
1104 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1105   if (common->utf)
1106     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
1107                      SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default_utf));
1108   else
1109 #endif
1110     sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_RET(SW) | SLJIT_ARG1(SW) | SLJIT_ARG2(SW) | SLJIT_ARG3(SW) | SLJIT_ARG4(SW),
1111                      SLJIT_IMM, SLJIT_FUNC_OFFSET(ffcps_default));
1112 }
1113 
1114 /* Restore STR_PTR register. */
1115 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
1116 
1117 /* Check return value. */
1118 partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
1119 add_jump(compiler, &common->failed_match, partial_quit);
1120 
1121 /* Fast forward STR_PTR to the result of memchr. */
1122 OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
1123 
1124 JUMPHERE(partial_quit);
1125 }
1126 
1127 #endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1128 
1129 #if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1130 
/* Element size code used when building the vector instructions below: 0, 1
   or 2 for 8-, 16- or 32-bit code units (the base 2 logarithm of the code
   unit size in bytes). The instruction builders shift it into bits 12 and
   up of the third instruction halfword. */
#if PCRE2_CODE_UNIT_WIDTH == 8
#define VECTOR_ELEMENT_SIZE 0
#elif PCRE2_CODE_UNIT_WIDTH == 16
#define VECTOR_ELEMENT_SIZE 1
#elif PCRE2_CODE_UNIT_WIDTH == 32
#define VECTOR_ELEMENT_SIZE 2
#else
#error "Unsupported unit width"
#endif
1140 
1141 static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
1142   sljit_s32 base_reg, sljit_s32 index_reg)
1143 {
1144 sljit_u16 instruction[3];
1145 
1146 instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
1147 instruction[1] = (sljit_u16)(base_reg << 12);
1148 instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 0x07 : 0x06));
1149 
1150 sljit_emit_op_custom(compiler, instruction, 6);
1151 }
1152 
1153 #if PCRE2_CODE_UNIT_WIDTH == 32
1154 
1155 static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
1156   PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
1157 {
1158 sljit_u16 instruction[3];
1159 
1160 SLJIT_ASSERT(step >= 0 && step <= 1);
1161 
1162 if (chr < 0x7fff)
1163   {
1164   if (step == 1)
1165     return;
1166 
1167   /* VREPI */
1168   instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
1169   instruction[1] = (sljit_u16)chr;
1170   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1171   sljit_emit_op_custom(compiler, instruction, 6);
1172   return;
1173   }
1174 
1175 if (step == 0)
1176   {
1177   OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);
1178 
1179   /* VLVG */
1180   instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(tmp_general_reg));
1181   instruction[1] = 0;
1182   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
1183   sljit_emit_op_custom(compiler, instruction, 6);
1184   return;
1185   }
1186 
1187 /* VREP */
1188 instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
1189 instruction[1] = 0;
1190 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
1191 sljit_emit_op_custom(compiler, instruction, 6);
1192 }
1193 
1194 #endif
1195 
1196 static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
1197   int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
1198 {
1199 sljit_u16 instruction[3];
1200 
1201 SLJIT_ASSERT(step >= 0 && step <= 2);
1202 
1203 if (step == 1)
1204   {
1205   /* VCEQ */
1206   instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1207   instruction[1] = (sljit_u16)(cmp1_ind << 12);
1208   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1209   sljit_emit_op_custom(compiler, instruction, 6);
1210   return;
1211   }
1212 
1213 if (compare_type != vector_compare_match2)
1214   {
1215   if (step == 0 && compare_type == vector_compare_match1i)
1216     {
1217     /* VO */
1218     instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1219     instruction[1] = (sljit_u16)(cmp2_ind << 12);
1220     instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1221     sljit_emit_op_custom(compiler, instruction, 6);
1222     }
1223   return;
1224   }
1225 
1226 switch (step)
1227   {
1228   case 0:
1229   /* VCEQ */
1230   instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
1231   instruction[1] = (sljit_u16)(cmp2_ind << 12);
1232   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1233   sljit_emit_op_custom(compiler, instruction, 6);
1234   return;
1235 
1236   case 2:
1237   /* VO */
1238   instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1239   instruction[1] = (sljit_u16)(tmp_ind << 12);
1240   instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1241   sljit_emit_op_custom(compiler, instruction, 6);
1242   return;
1243   }
1244 }
1245 
1246 #define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1247 
1248 static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1249 {
1250 DEFINE_COMPILER;
1251 sljit_u16 instruction[3];
1252 struct sljit_label *start;
1253 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1254 struct sljit_label *restart;
1255 #endif
1256 struct sljit_jump *quit;
1257 struct sljit_jump *partial_quit[2];
1258 vector_compare_type compare_type = vector_compare_match1;
1259 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1260 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
1261 sljit_s32 data_ind = 0;
1262 sljit_s32 tmp_ind = 1;
1263 sljit_s32 cmp1_ind = 2;
1264 sljit_s32 cmp2_ind = 3;
1265 sljit_s32 zero_ind = 4;
1266 sljit_u32 bit = 0;
1267 int i;
1268 
1269 SLJIT_UNUSED_ARG(offset);
1270 
1271 if (char1 != char2)
1272   {
1273   bit = char1 ^ char2;
1274   compare_type = vector_compare_match1i;
1275 
1276   if (!is_powerof2(bit))
1277     {
1278     bit = 0;
1279     compare_type = vector_compare_match2;
1280     }
1281   }
1282 
1283 partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1284 if (common->mode == PCRE2_JIT_COMPLETE)
1285   add_jump(compiler, &common->failed_match, partial_quit[0]);
1286 
1287 /* First part (unaligned start) */
1288 
1289 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1290 
1291 #if PCRE2_CODE_UNIT_WIDTH != 32
1292 
1293 /* VREPI */
1294 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1295 instruction[1] = (sljit_u16)(char1 | bit);
1296 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1297 sljit_emit_op_custom(compiler, instruction, 6);
1298 
1299 if (char1 != char2)
1300   {
1301   /* VREPI */
1302   instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1303   instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1304   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1305   sljit_emit_op_custom(compiler, instruction, 6);
1306   }
1307 
1308 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1309 
1310 for (int i = 0; i < 2; i++)
1311   {
1312   replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1313 
1314   if (char1 != char2)
1315     replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1316   }
1317 
1318 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1319 
1320 if (compare_type == vector_compare_match2)
1321   {
1322   /* VREPI */
1323   instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1324   instruction[1] = 0;
1325   instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1326   sljit_emit_op_custom(compiler, instruction, 6);
1327   }
1328 
1329 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1330 restart = LABEL();
1331 #endif
1332 
1333 load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1334 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1335 
1336 if (compare_type != vector_compare_match2)
1337   {
1338   if (compare_type == vector_compare_match1i)
1339     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1340 
1341   /* VFEE */
1342   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1343   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1344   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1345   sljit_emit_op_custom(compiler, instruction, 6);
1346   }
1347 else
1348   {
1349   for (i = 0; i < 3; i++)
1350     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1351 
1352   /* VFENE */
1353   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1354   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1355   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1356   sljit_emit_op_custom(compiler, instruction, 6);
1357   }
1358 
1359 /* VLGVB */
1360 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1361 instruction[1] = 7;
1362 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1363 sljit_emit_op_custom(compiler, instruction, 6);
1364 
1365 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1366 quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1367 
1368 OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1369 
1370 /* Second part (aligned) */
1371 start = LABEL();
1372 
1373 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1374 
1375 partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1376 if (common->mode == PCRE2_JIT_COMPLETE)
1377   add_jump(compiler, &common->failed_match, partial_quit[1]);
1378 
1379 load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1380 
1381 if (compare_type != vector_compare_match2)
1382   {
1383   if (compare_type == vector_compare_match1i)
1384     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1385 
1386   /* VFEE */
1387   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1388   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1389   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1390   sljit_emit_op_custom(compiler, instruction, 6);
1391   }
1392 else
1393   {
1394   for (i = 0; i < 3; i++)
1395     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1396 
1397   /* VFENE */
1398   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1399   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1400   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1401   sljit_emit_op_custom(compiler, instruction, 6);
1402   }
1403 
1404 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1405 JUMPTO(SLJIT_OVERFLOW, start);
1406 
1407 /* VLGVB */
1408 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1409 instruction[1] = 7;
1410 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1411 sljit_emit_op_custom(compiler, instruction, 6);
1412 
1413 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1414 
1415 JUMPHERE(quit);
1416 
1417 if (common->mode != PCRE2_JIT_COMPLETE)
1418   {
1419   JUMPHERE(partial_quit[0]);
1420   JUMPHERE(partial_quit[1]);
1421   OP2(SLJIT_SUB | SLJIT_SET_GREATER, SLJIT_UNUSED, 0, STR_PTR, 0, STR_END, 0);
1422   CMOV(SLJIT_GREATER, STR_PTR, STR_END, 0);
1423   }
1424 else
1425   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1426 
1427 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1428 if (common->utf && offset > 0)
1429   {
1430   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1431 
1432   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
1433 
1434   quit = jump_if_utf_char_start(compiler, TMP1);
1435 
1436   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1437   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1438 
1439   OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1440   JUMPTO(SLJIT_JUMP, restart);
1441 
1442   JUMPHERE(quit);
1443   }
1444 #endif
1445 }
1446 
1447 #define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1448 
1449 static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1450 {
1451 DEFINE_COMPILER;
1452 sljit_u16 instruction[3];
1453 struct sljit_label *start;
1454 struct sljit_jump *quit;
1455 jump_list *not_found = NULL;
1456 vector_compare_type compare_type = vector_compare_match1;
1457 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1458 sljit_s32 tmp3_reg_ind = sljit_get_register_index(TMP3);
1459 sljit_s32 data_ind = 0;
1460 sljit_s32 tmp_ind = 1;
1461 sljit_s32 cmp1_ind = 2;
1462 sljit_s32 cmp2_ind = 3;
1463 sljit_s32 zero_ind = 4;
1464 sljit_u32 bit = 0;
1465 int i;
1466 
1467 if (char1 != char2)
1468   {
1469   bit = char1 ^ char2;
1470   compare_type = vector_compare_match1i;
1471 
1472   if (!is_powerof2(bit))
1473     {
1474     bit = 0;
1475     compare_type = vector_compare_match2;
1476     }
1477   }
1478 
1479 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1480 
1481 /* First part (unaligned start) */
1482 
1483 OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1484 
1485 #if PCRE2_CODE_UNIT_WIDTH != 32
1486 
1487 /* VREPI */
1488 instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1489 instruction[1] = (sljit_u16)(char1 | bit);
1490 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1491 sljit_emit_op_custom(compiler, instruction, 6);
1492 
1493 if (char1 != char2)
1494   {
1495   /* VREPI */
1496   instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1497   instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1498   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1499   sljit_emit_op_custom(compiler, instruction, 6);
1500   }
1501 
1502 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1503 
1504 for (int i = 0; i < 2; i++)
1505   {
1506   replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1507 
1508   if (char1 != char2)
1509     replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1510   }
1511 
1512 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1513 
1514 if (compare_type == vector_compare_match2)
1515   {
1516   /* VREPI */
1517   instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1518   instruction[1] = 0;
1519   instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1520   sljit_emit_op_custom(compiler, instruction, 6);
1521   }
1522 
1523 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1524 OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1525 
1526 if (compare_type != vector_compare_match2)
1527   {
1528   if (compare_type == vector_compare_match1i)
1529     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1530 
1531   /* VFEE */
1532   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1533   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1534   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1535   sljit_emit_op_custom(compiler, instruction, 6);
1536   }
1537 else
1538   {
1539   for (i = 0; i < 3; i++)
1540     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1541 
1542   /* VFENE */
1543   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1544   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1545   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1546   sljit_emit_op_custom(compiler, instruction, 6);
1547   }
1548 
1549 /* VLGVB */
1550 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1551 instruction[1] = 7;
1552 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1553 sljit_emit_op_custom(compiler, instruction, 6);
1554 
1555 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1556 quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1557 
1558 OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1559 
1560 /* Second part (aligned) */
1561 start = LABEL();
1562 
1563 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1564 
1565 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1566 
1567 load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1568 
1569 if (compare_type != vector_compare_match2)
1570   {
1571   if (compare_type == vector_compare_match1i)
1572     fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1573 
1574   /* VFEE */
1575   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1576   instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1577   instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1578   sljit_emit_op_custom(compiler, instruction, 6);
1579   }
1580 else
1581   {
1582   for (i = 0; i < 3; i++)
1583     fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1584 
1585   /* VFENE */
1586   instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1587   instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1588   instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1589   sljit_emit_op_custom(compiler, instruction, 6);
1590   }
1591 
1592 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1593 JUMPTO(SLJIT_OVERFLOW, start);
1594 
1595 /* VLGVB */
1596 instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1597 instruction[1] = 7;
1598 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1599 sljit_emit_op_custom(compiler, instruction, 6);
1600 
1601 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1602 
1603 JUMPHERE(quit);
1604 add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1605 
1606 return not_found;
1607 }
1608 
1609 #define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1610 
1611 static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1612   PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1613 {
1614 DEFINE_COMPILER;
1615 sljit_u16 instruction[3];
1616 struct sljit_label *start;
1617 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1618 struct sljit_label *restart;
1619 #endif
1620 struct sljit_jump *quit;
1621 struct sljit_jump *jump[2];
1622 vector_compare_type compare1_type = vector_compare_match1;
1623 vector_compare_type compare2_type = vector_compare_match1;
1624 sljit_u32 bit1 = 0;
1625 sljit_u32 bit2 = 0;
1626 sljit_s32 diff = IN_UCHARS(offs2 - offs1);
1627 sljit_s32 tmp1_reg_ind = sljit_get_register_index(TMP1);
1628 sljit_s32 tmp2_reg_ind = sljit_get_register_index(TMP2);
1629 sljit_s32 str_ptr_reg_ind = sljit_get_register_index(STR_PTR);
1630 sljit_s32 data1_ind = 0;
1631 sljit_s32 data2_ind = 1;
1632 sljit_s32 tmp1_ind = 2;
1633 sljit_s32 tmp2_ind = 3;
1634 sljit_s32 cmp1a_ind = 4;
1635 sljit_s32 cmp1b_ind = 5;
1636 sljit_s32 cmp2a_ind = 6;
1637 sljit_s32 cmp2b_ind = 7;
1638 sljit_s32 zero_ind = 8;
1639 int i;
1640 
1641 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1642 SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
1643 SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);
1644 
1645 if (char1a != char1b)
1646   {
1647   bit1 = char1a ^ char1b;
1648   compare1_type = vector_compare_match1i;
1649 
1650   if (!is_powerof2(bit1))
1651     {
1652     bit1 = 0;
1653     compare1_type = vector_compare_match2;
1654     }
1655   }
1656 
1657 if (char2a != char2b)
1658   {
1659   bit2 = char2a ^ char2b;
1660   compare2_type = vector_compare_match1i;
1661 
1662   if (!is_powerof2(bit2))
1663     {
1664     bit2 = 0;
1665     compare2_type = vector_compare_match2;
1666     }
1667   }
1668 
1669 /* Initialize. */
1670 if (common->match_end_ptr != 0)
1671   {
1672   OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1673   OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
1674   OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1675 
1676   OP2(SLJIT_SUB | SLJIT_SET_LESS, SLJIT_UNUSED, 0, TMP1, 0, STR_END, 0);
1677   CMOV(SLJIT_LESS, STR_END, TMP1, 0);
1678   }
1679 
1680 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1681 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1682 OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1683 
1684 #if PCRE2_CODE_UNIT_WIDTH != 32
1685 
1686 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1687 
1688 /* VREPI */
1689 instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
1690 instruction[1] = (sljit_u16)(char1a | bit1);
1691 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1692 sljit_emit_op_custom(compiler, instruction, 6);
1693 
1694 if (char1a != char1b)
1695   {
1696   /* VREPI */
1697   instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
1698   instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
1699   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1700   sljit_emit_op_custom(compiler, instruction, 6);
1701   }
1702 
1703 /* VREPI */
1704 instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
1705 instruction[1] = (sljit_u16)(char2a | bit2);
1706 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1707 sljit_emit_op_custom(compiler, instruction, 6);
1708 
1709 if (char2a != char2b)
1710   {
1711   /* VREPI */
1712   instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
1713   instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
1714   /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1715   sljit_emit_op_custom(compiler, instruction, 6);
1716   }
1717 
1718 #else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1719 
1720 for (int i = 0; i < 2; i++)
1721   {
1722   replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);
1723 
1724   if (char1a != char1b)
1725     replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);
1726 
1727   replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);
1728 
1729   if (char2a != char2b)
1730     replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
1731   }
1732 
1733 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1734 
1735 #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1736 
1737 /* VREPI */
1738 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1739 instruction[1] = 0;
1740 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1741 sljit_emit_op_custom(compiler, instruction, 6);
1742 
1743 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1744 restart = LABEL();
1745 #endif
1746 
1747 jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1748 load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
1749 jump[1] = JUMP(SLJIT_JUMP);
1750 JUMPHERE(jump[0]);
1751 load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
1752 JUMPHERE(jump[1]);
1753 
1754 load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
1755 OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);
1756 
1757 for (i = 0; i < 3; i++)
1758   {
1759   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1760   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1761   }
1762 
1763 /* VN */
1764 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1765 instruction[1] = (sljit_u16)(data2_ind << 12);
1766 instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1767 sljit_emit_op_custom(compiler, instruction, 6);
1768 
1769 /* VFENE */
1770 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1771 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1772 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1773 sljit_emit_op_custom(compiler, instruction, 6);
1774 
1775 /* VLGVB */
1776 instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
1777 instruction[1] = 7;
1778 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1779 sljit_emit_op_custom(compiler, instruction, 6);
1780 
1781 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1782 quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1783 
1784 OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1785 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);
1786 
1787 /* Main loop. */
1788 start = LABEL();
1789 
1790 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1791 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1792 
1793 load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
1794 load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);
1795 
1796 for (i = 0; i < 3; i++)
1797   {
1798   fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1799   fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1800   }
1801 
1802 /* VN */
1803 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1804 instruction[1] = (sljit_u16)(data2_ind << 12);
1805 instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1806 sljit_emit_op_custom(compiler, instruction, 6);
1807 
1808 /* VFENE */
1809 instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1810 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1811 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1812 sljit_emit_op_custom(compiler, instruction, 6);
1813 
1814 sljit_set_current_flags(compiler, SLJIT_SET_OVERFLOW);
1815 JUMPTO(SLJIT_OVERFLOW, start);
1816 
1817 /* VLGVB */
1818 instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
1819 instruction[1] = 7;
1820 instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1821 sljit_emit_op_custom(compiler, instruction, 6);
1822 
1823 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1824 
1825 JUMPHERE(quit);
1826 
1827 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1828 
1829 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1830 if (common->utf)
1831   {
1832   SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1833 
1834   OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
1835 
1836   quit = jump_if_utf_char_start(compiler, TMP1);
1837 
1838   OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1839   add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1840 
1841   /* TMP1 contains diff. */
1842   OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1843   OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1844   JUMPTO(SLJIT_JUMP, restart);
1845 
1846   JUMPHERE(quit);
1847   }
1848 #endif
1849 
1850 OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1851 
1852 if (common->match_end_ptr != 0)
1853   OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
1854 }
1855 
1856 #endif /* SLJIT_CONFIG_S390X */
1857 
1858 #endif /* !SUPPORT_VALGRIND */
1859