1 /*
2 Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3
4 Redistribution and use in source and binary forms, with or without modification,
5 are permitted provided that the following conditions are met:
6
7 1. Redistributions of source code must retain the above copyright notice, this
8 list of conditions and the following disclaimer.
9
10 2. Redistributions in binary form must reproduce the above copyright notice,
11 this list of conditions and the following disclaimer in the documentation and/or
12 other materials provided with the distribution.
13
14 3. Neither the name of the copyright holder nor the names of its contributors
15 may be used to endorse or promote products derived from this software without
16 specific prior written permission.
17
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 /*
31
32 This module implements a regular expressions engine based on Thompson's
33 algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34
35 What the article names a "thread" has been named a "fiber" in this code, in
36 order to avoid confusion with operating system threads.
37
38 */
39
40 #include <assert.h>
41 #include <string.h>
42 #include <yara/compiler.h>
43 #include <yara/error.h>
44 #include <yara/globals.h>
45 #include <yara/hex_lexer.h>
46 #include <yara/limits.h>
47 #include <yara/mem.h>
48 #include <yara/re.h>
49 #include <yara/re_lexer.h>
50 #include <yara/threading.h>
51 #include <yara/utils.h>
52
53 #define EMIT_BACKWARDS 0x01
54 #define EMIT_DONT_SET_FORWARDS_CODE 0x02
55 #define EMIT_DONT_SET_BACKWARDS_CODE 0x04
56
57 #ifndef INT16_MAX
58 #define INT16_MAX (32767)
59 #endif
60
61 typedef uint8_t RE_SPLIT_ID_TYPE;
62
63 typedef struct _RE_REPEAT_ARGS
64 {
65 uint16_t min;
66 uint16_t max;
67 int32_t offset;
68
69 } RE_REPEAT_ARGS;
70
71 typedef struct _RE_REPEAT_ANY_ARGS
72 {
73 uint16_t min;
74 uint16_t max;
75
76 } RE_REPEAT_ANY_ARGS;
77
78 typedef struct _RE_EMIT_CONTEXT
79 {
80 YR_ARENA* arena;
81 RE_SPLIT_ID_TYPE next_split_id;
82
83 } RE_EMIT_CONTEXT;
84
85 #define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
86
_yr_re_is_char_in_class(RE_CLASS * re_class,uint8_t chr,int case_insensitive)87 static bool _yr_re_is_char_in_class(
88 RE_CLASS* re_class,
89 uint8_t chr,
90 int case_insensitive)
91 {
92 int result = CHAR_IN_CLASS(re_class->bitmap, chr);
93
94 if (case_insensitive)
95 result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
96
97 if (re_class->negated)
98 result = !result;
99
100 return result;
101 }
102
_yr_re_is_word_char(const uint8_t * input,uint8_t character_size)103 static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
104 {
105 int result = ((isalnum(*input) || (*input) == '_'));
106
107 if (character_size == 2)
108 result = result && (*(input + 1) == 0);
109
110 return result;
111 }
112
yr_re_node_create(int type)113 RE_NODE* yr_re_node_create(int type)
114 {
115 RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
116
117 if (result != NULL)
118 {
119 result->type = type;
120 result->children_head = NULL;
121 result->children_tail = NULL;
122 result->prev_sibling = NULL;
123 result->next_sibling = NULL;
124 result->greedy = true;
125 result->forward_code_ref = YR_ARENA_NULL_REF;
126 result->backward_code_ref = YR_ARENA_NULL_REF;
127 }
128
129 return result;
130 }
131
yr_re_node_destroy(RE_NODE * node)132 void yr_re_node_destroy(RE_NODE* node)
133 {
134 RE_NODE* child = node->children_head;
135 RE_NODE* next_child;
136
137 while (child != NULL)
138 {
139 next_child = child->next_sibling;
140 yr_re_node_destroy(child);
141 child = next_child;
142 }
143
144 if (node->type == RE_NODE_CLASS)
145 yr_free(node->re_class);
146
147 yr_free(node);
148 }
149
150 ////////////////////////////////////////////////////////////////////////////////
151 // Appends a node to the end of the children list.
152 //
yr_re_node_append_child(RE_NODE * node,RE_NODE * child)153 void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
154 {
155 if (node->children_head == NULL)
156 node->children_head = child;
157
158 if (node->children_tail != NULL)
159 node->children_tail->next_sibling = child;
160
161 child->prev_sibling = node->children_tail;
162 node->children_tail = child;
163 }
164
165 ////////////////////////////////////////////////////////////////////////////////
166 // Appends a node to the beginning of the children list.
167 //
yr_re_node_prepend_child(RE_NODE * node,RE_NODE * child)168 void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
169 {
170 child->next_sibling = node->children_head;
171
172 if (node->children_head != NULL)
173 node->children_head->prev_sibling = child;
174
175 node->children_head = child;
176
177 if (node->children_tail == NULL)
178 node->children_tail = child;
179 }
180
yr_re_ast_create(RE_AST ** re_ast)181 int yr_re_ast_create(RE_AST** re_ast)
182 {
183 *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
184
185 if (*re_ast == NULL)
186 return ERROR_INSUFFICIENT_MEMORY;
187
188 (*re_ast)->flags = 0;
189 (*re_ast)->root_node = NULL;
190
191 return ERROR_SUCCESS;
192 }
193
yr_re_ast_destroy(RE_AST * re_ast)194 void yr_re_ast_destroy(RE_AST* re_ast)
195 {
196 if (re_ast->root_node != NULL)
197 yr_re_node_destroy(re_ast->root_node);
198
199 yr_free(re_ast);
200 }
201
202 ////////////////////////////////////////////////////////////////////////////////
203 // Parses a regexp but don't emit its code. A further call to
204 // yr_re_ast_emit_code is required to get the code.
205 //
yr_re_parse(const char * re_string,RE_AST ** re_ast,RE_ERROR * error)206 int yr_re_parse(const char* re_string, RE_AST** re_ast, RE_ERROR* error)
207 {
208 return yr_parse_re_string(re_string, re_ast, error);
209 }
210
211 ////////////////////////////////////////////////////////////////////////////////
212 // Parses a hex string but don't emit its code. A further call to
213 // yr_re_ast_emit_code is required to get the code.
214 //
yr_re_parse_hex(const char * hex_string,RE_AST ** re_ast,RE_ERROR * error)215 int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
216 {
217 return yr_parse_hex_string(hex_string, re_ast, error);
218 }
219
220 ////////////////////////////////////////////////////////////////////////////////
221 // Parses the regexp and emit its code to the provided to the
222 // YR_RE_CODE_SECTION in the specified arena.
223 //
yr_re_compile(const char * re_string,int flags,YR_ARENA * arena,YR_ARENA_REF * ref,RE_ERROR * error)224 int yr_re_compile(
225 const char* re_string,
226 int flags,
227 YR_ARENA* arena,
228 YR_ARENA_REF* ref,
229 RE_ERROR* error)
230 {
231 RE_AST* re_ast;
232 RE _re;
233
234 FAIL_ON_ERROR(yr_re_parse(re_string, &re_ast, error));
235
236 _re.flags = flags;
237
238 FAIL_ON_ERROR_WITH_CLEANUP(
239 yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
240 yr_re_ast_destroy(re_ast));
241
242 FAIL_ON_ERROR_WITH_CLEANUP(
243 yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
244
245 yr_re_ast_destroy(re_ast);
246
247 return ERROR_SUCCESS;
248 }
249
250 ////////////////////////////////////////////////////////////////////////////////
251 // Verifies if the target string matches the pattern
252 //
253 // Args:
254 // context: Scan context
255 // re: A pointer to a compiled regexp
256 // target: Target string
257 //
258 // Returns:
259 // See return codes for yr_re_exec
260 //
yr_re_match(YR_SCAN_CONTEXT * context,RE * re,const char * target)261 int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
262 {
263 int result;
264
265 yr_re_exec(
266 context,
267 re->code,
268 (uint8_t*) target,
269 strlen(target),
270 0,
271 re->flags | RE_FLAGS_SCAN,
272 NULL,
273 NULL,
274 &result);
275
276 return result;
277 }
278
279 ////////////////////////////////////////////////////////////////////////////////
280 // Verifies if the provided regular expression is just a literal string
281 // like "abc", "12345", without any wildcard, operator, etc. In that case
282 // returns the string as a SIZED_STRING, or returns NULL if otherwise.
283 //
284 // The caller is responsible for deallocating the returned SIZED_STRING by
285 // calling yr_free.
286 //
yr_re_ast_extract_literal(RE_AST * re_ast)287 SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
288 {
289 SIZED_STRING* string;
290 RE_NODE* child;
291
292 int length = 0;
293
294 if (re_ast->root_node->type == RE_NODE_LITERAL)
295 {
296 length = 1;
297 }
298 else if (re_ast->root_node->type == RE_NODE_CONCAT)
299 {
300 child = re_ast->root_node->children_tail;
301
302 while (child != NULL && child->type == RE_NODE_LITERAL)
303 {
304 length++;
305 child = child->prev_sibling;
306 }
307
308 if (child != NULL)
309 return NULL;
310 }
311 else
312 {
313 return NULL;
314 }
315
316 string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
317
318 if (string == NULL)
319 return NULL;
320
321 string->length = length;
322 string->flags = 0;
323
324 if (re_ast->root_node->type == RE_NODE_LITERAL)
325 {
326 string->c_string[0] = re_ast->root_node->value;
327 }
328 else
329 {
330 child = re_ast->root_node->children_tail;
331
332 while (child != NULL)
333 {
334 string->c_string[--length] = child->value;
335 child = child->prev_sibling;
336 }
337 }
338
339 string->c_string[string->length] = '\0';
340
341 return string;
342 }
343
_yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE * re_node)344 int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
345 {
346 RE_NODE* child;
347
348 if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
349 re_node->children_head->type == RE_NODE_ANY)
350 return true;
351
352 if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
353 return true;
354
355 if (re_node->type == RE_NODE_CONCAT)
356 {
357 child = re_node->children_tail;
358
359 while (child != NULL)
360 {
361 if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
362 return true;
363
364 child = child->prev_sibling;
365 }
366 }
367
368 return false;
369 }
370
371 ////////////////////////////////////////////////////////////////////////////////
372 // Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
373 // quantifiers that don't have a reasonably small upper bound causes a
374 // performance penalty. This function dectects such cases in order to warn the
375 // user about this.
376 //
yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST * re_ast)377 int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
378 {
379 return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
380 }
381
382 ////////////////////////////////////////////////////////////////////////////////
383 // In some cases splitting a regular expression (or hex string) in two parts is
384 // convenient for increasing performance. This happens when the pattern contains
385 // a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
386 // In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
387 // the latter is chained to the former. This means that { 01 02 03 } and
388 // { 04 05 06 } are handled as individual strings, and when both of them are
389 // found, YARA verifies if the distance between the matches complies with the
390 // [0-999] restriction.
391 //
392 // This function traverses a regexp's AST looking for nodes where it should be
393 // splitted. It must be noticed that this only applies to two-level ASTs (i.e.
394 // an AST consisting in a RE_NODE_CONCAT at the root where all the children are
395 // leaves).
396 //
397 // For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
398 // following AST:
399 //
400 // RE_NODE_CONCAT
401 // |
402 // |- RE_NODE_LITERAL (01)
403 // |- RE_NODE_LITERAL (02)
404 // |- RE_NODE_LITERAL (03)
405 // |- RE_NODE_RANGE_ANY (start=0, end=1000)
406 // |- RE_NODE_LITERAL (04)
407 // |- RE_NODE_LITERAL (05)
408 // |- RE_NODE_LITERAL (06)
409 // |- RE_NODE_RANGE_ANY (start=500, end=2000)
410 // |- RE_NODE_LITERAL (07)
411 // |- RE_NODE_LITERAL (08)
412 // |- RE_NODE_LITERAL (09)
413 //
414 // If the AST above is passed in the re_ast argument, it will be trimmed to:
415 //
416 // RE_NODE_CONCAT
417 // |
418 // |- RE_NODE_LITERAL (01)
419 // |- RE_NODE_LITERAL (02)
420 // |- RE_NODE_LITERAL (03)
421 //
422 // While remainder_re_ast will be:
423 //
424 // RE_NODE_CONCAT
425 // |
426 // |- RE_NODE_LITERAL (04)
427 // |- RE_NODE_LITERAL (05)
428 // |- RE_NODE_LITERAL (06)
429 // |- RE_NODE_RANGE_ANY (start=500, end=2000)
430 // |- RE_NODE_LITERAL (07)
431 // |- RE_NODE_LITERAL (08)
432 // |- RE_NODE_LITERAL (09)
433 //
434 // The caller is responsible for freeing the new AST in remainder_re_ast by
435 // calling yr_re_ast_destroy.
436 //
437 // The integers pointed to by min_gap and max_gap will be filled with the
438 // minimum and maximum gap size between the sub-strings represented by the
439 // two ASTs.
440 //
yr_re_ast_split_at_chaining_point(RE_AST * re_ast,RE_AST ** remainder_re_ast,int32_t * min_gap,int32_t * max_gap)441 int yr_re_ast_split_at_chaining_point(
442 RE_AST* re_ast,
443 RE_AST** remainder_re_ast,
444 int32_t* min_gap,
445 int32_t* max_gap)
446 {
447 RE_NODE* child;
448 RE_NODE* concat;
449
450 int result;
451
452 *remainder_re_ast = NULL;
453 *min_gap = 0;
454 *max_gap = 0;
455
456 if (re_ast->root_node->type != RE_NODE_CONCAT)
457 return ERROR_SUCCESS;
458
459 child = re_ast->root_node->children_head;
460
461 while (child != NULL)
462 {
463 if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
464 child->prev_sibling != NULL && child->next_sibling != NULL &&
465 (child->start > YR_STRING_CHAINING_THRESHOLD ||
466 child->end > YR_STRING_CHAINING_THRESHOLD))
467 {
468 result = yr_re_ast_create(remainder_re_ast);
469
470 if (result != ERROR_SUCCESS)
471 return result;
472
473 concat = yr_re_node_create(RE_NODE_CONCAT);
474
475 if (concat == NULL)
476 return ERROR_INSUFFICIENT_MEMORY;
477
478 concat->children_head = child->next_sibling;
479 concat->children_tail = re_ast->root_node->children_tail;
480
481 re_ast->root_node->children_tail = child->prev_sibling;
482
483 child->prev_sibling->next_sibling = NULL;
484 child->next_sibling->prev_sibling = NULL;
485
486 *min_gap = child->start;
487 *max_gap = child->end;
488
489 (*remainder_re_ast)->root_node = concat;
490 (*remainder_re_ast)->flags = re_ast->flags;
491
492 yr_re_node_destroy(child);
493
494 return ERROR_SUCCESS;
495 }
496
497 child = child->next_sibling;
498 }
499
500 return ERROR_SUCCESS;
501 }
502
_yr_emit_inst(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,YR_ARENA_REF * instruction_ref)503 int _yr_emit_inst(
504 RE_EMIT_CONTEXT* emit_context,
505 uint8_t opcode,
506 YR_ARENA_REF* instruction_ref)
507 {
508 FAIL_ON_ERROR(yr_arena_write_data(
509 emit_context->arena,
510 YR_RE_CODE_SECTION,
511 &opcode,
512 sizeof(uint8_t),
513 instruction_ref));
514
515 return ERROR_SUCCESS;
516 }
517
_yr_emit_inst_arg_uint8(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,uint8_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)518 int _yr_emit_inst_arg_uint8(
519 RE_EMIT_CONTEXT* emit_context,
520 uint8_t opcode,
521 uint8_t argument,
522 YR_ARENA_REF* instruction_ref,
523 YR_ARENA_REF* argument_ref)
524 {
525 FAIL_ON_ERROR(yr_arena_write_data(
526 emit_context->arena,
527 YR_RE_CODE_SECTION,
528 &opcode,
529 sizeof(uint8_t),
530 instruction_ref));
531
532 FAIL_ON_ERROR(yr_arena_write_data(
533 emit_context->arena,
534 YR_RE_CODE_SECTION,
535 &argument,
536 sizeof(uint8_t),
537 argument_ref));
538
539 return ERROR_SUCCESS;
540 }
541
_yr_emit_inst_arg_uint16(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,uint16_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)542 int _yr_emit_inst_arg_uint16(
543 RE_EMIT_CONTEXT* emit_context,
544 uint8_t opcode,
545 uint16_t argument,
546 YR_ARENA_REF* instruction_ref,
547 YR_ARENA_REF* argument_ref)
548 {
549 FAIL_ON_ERROR(yr_arena_write_data(
550 emit_context->arena,
551 YR_RE_CODE_SECTION,
552 &opcode,
553 sizeof(uint8_t),
554 instruction_ref));
555
556 FAIL_ON_ERROR(yr_arena_write_data(
557 emit_context->arena,
558 YR_RE_CODE_SECTION,
559 &argument,
560 sizeof(uint16_t),
561 argument_ref));
562
563 return ERROR_SUCCESS;
564 }
565
_yr_emit_inst_arg_uint32(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,uint32_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)566 int _yr_emit_inst_arg_uint32(
567 RE_EMIT_CONTEXT* emit_context,
568 uint8_t opcode,
569 uint32_t argument,
570 YR_ARENA_REF* instruction_ref,
571 YR_ARENA_REF* argument_ref)
572 {
573 FAIL_ON_ERROR(yr_arena_write_data(
574 emit_context->arena,
575 YR_RE_CODE_SECTION,
576 &opcode,
577 sizeof(uint8_t),
578 instruction_ref));
579
580 FAIL_ON_ERROR(yr_arena_write_data(
581 emit_context->arena,
582 YR_RE_CODE_SECTION,
583 &argument,
584 sizeof(uint32_t),
585 argument_ref));
586
587 return ERROR_SUCCESS;
588 }
589
_yr_emit_inst_arg_int16(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,int16_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)590 int _yr_emit_inst_arg_int16(
591 RE_EMIT_CONTEXT* emit_context,
592 uint8_t opcode,
593 int16_t argument,
594 YR_ARENA_REF* instruction_ref,
595 YR_ARENA_REF* argument_ref)
596 {
597 FAIL_ON_ERROR(yr_arena_write_data(
598 emit_context->arena,
599 YR_RE_CODE_SECTION,
600 &opcode,
601 sizeof(uint8_t),
602 instruction_ref));
603
604 FAIL_ON_ERROR(yr_arena_write_data(
605 emit_context->arena,
606 YR_RE_CODE_SECTION,
607 &argument,
608 sizeof(int16_t),
609 argument_ref));
610
611 return ERROR_SUCCESS;
612 }
613
_yr_emit_inst_arg_struct(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,void * structure,size_t structure_size,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)614 int _yr_emit_inst_arg_struct(
615 RE_EMIT_CONTEXT* emit_context,
616 uint8_t opcode,
617 void* structure,
618 size_t structure_size,
619 YR_ARENA_REF* instruction_ref,
620 YR_ARENA_REF* argument_ref)
621 {
622 FAIL_ON_ERROR(yr_arena_write_data(
623 emit_context->arena,
624 YR_RE_CODE_SECTION,
625 &opcode,
626 sizeof(uint8_t),
627 instruction_ref));
628
629 FAIL_ON_ERROR(yr_arena_write_data(
630 emit_context->arena,
631 YR_RE_CODE_SECTION,
632 structure,
633 structure_size,
634 argument_ref));
635
636 return ERROR_SUCCESS;
637 }
638
_yr_emit_split(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,int16_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)639 int _yr_emit_split(
640 RE_EMIT_CONTEXT* emit_context,
641 uint8_t opcode,
642 int16_t argument,
643 YR_ARENA_REF* instruction_ref,
644 YR_ARENA_REF* argument_ref)
645 {
646 assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
647
648 if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
649 return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
650
651 FAIL_ON_ERROR(yr_arena_write_data(
652 emit_context->arena,
653 YR_RE_CODE_SECTION,
654 &opcode,
655 sizeof(uint8_t),
656 instruction_ref));
657
658 FAIL_ON_ERROR(yr_arena_write_data(
659 emit_context->arena,
660 YR_RE_CODE_SECTION,
661 &emit_context->next_split_id,
662 sizeof(RE_SPLIT_ID_TYPE),
663 NULL));
664
665 emit_context->next_split_id++;
666
667 FAIL_ON_ERROR(yr_arena_write_data(
668 emit_context->arena,
669 YR_RE_CODE_SECTION,
670 &argument,
671 sizeof(int16_t),
672 argument_ref));
673
674 return ERROR_SUCCESS;
675 }
676
677 #define current_re_code_offset() \
678 yr_arena_get_current_offset(emit_context->arena, YR_RE_CODE_SECTION)
679
_yr_re_emit(RE_EMIT_CONTEXT * emit_context,RE_NODE * re_node,int flags,YR_ARENA_REF * code_ref)680 static int _yr_re_emit(
681 RE_EMIT_CONTEXT* emit_context,
682 RE_NODE* re_node,
683 int flags,
684 YR_ARENA_REF* code_ref)
685 {
686 yr_arena_off_t jmp_offset;
687
688 yr_arena_off_t bookmark_1 = 0;
689 yr_arena_off_t bookmark_2 = 0;
690 yr_arena_off_t bookmark_3 = 0;
691 yr_arena_off_t bookmark_4 = 0;
692
693 bool emit_split;
694 bool emit_repeat;
695 bool emit_prolog;
696 bool emit_epilog;
697
698 RE_REPEAT_ARGS repeat_args;
699 RE_REPEAT_ARGS* repeat_start_args_addr;
700 RE_REPEAT_ANY_ARGS repeat_any_args;
701
702 RE_NODE* child;
703
704 int16_t* split_offset_addr = NULL;
705 int16_t* jmp_offset_addr = NULL;
706
707 YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
708 YR_ARENA_REF split_offset_ref;
709 YR_ARENA_REF jmp_instruction_ref;
710 YR_ARENA_REF jmp_offset_ref;
711 YR_ARENA_REF repeat_start_args_ref;
712
713 switch (re_node->type)
714 {
715 case RE_NODE_LITERAL:
716 FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
717 emit_context,
718 RE_OPCODE_LITERAL,
719 re_node->value,
720 &instruction_ref,
721 NULL));
722 break;
723
724 case RE_NODE_MASKED_LITERAL:
725 FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
726 emit_context,
727 RE_OPCODE_MASKED_LITERAL,
728 re_node->mask << 8 | re_node->value,
729 &instruction_ref,
730 NULL));
731 break;
732
733 case RE_NODE_WORD_CHAR:
734 FAIL_ON_ERROR(
735 _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
736 break;
737
738 case RE_NODE_NON_WORD_CHAR:
739 FAIL_ON_ERROR(
740 _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
741 break;
742
743 case RE_NODE_WORD_BOUNDARY:
744 FAIL_ON_ERROR(
745 _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
746 break;
747
748 case RE_NODE_NON_WORD_BOUNDARY:
749 FAIL_ON_ERROR(_yr_emit_inst(
750 emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
751 break;
752
753 case RE_NODE_SPACE:
754 FAIL_ON_ERROR(
755 _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
756 break;
757
758 case RE_NODE_NON_SPACE:
759 FAIL_ON_ERROR(
760 _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
761 break;
762
763 case RE_NODE_DIGIT:
764 FAIL_ON_ERROR(
765 _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
766 break;
767
768 case RE_NODE_NON_DIGIT:
769 FAIL_ON_ERROR(
770 _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
771 break;
772
773 case RE_NODE_ANY:
774 FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
775 break;
776
777 case RE_NODE_CLASS:
778 FAIL_ON_ERROR(
779 _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
780
781 FAIL_ON_ERROR(yr_arena_write_data(
782 emit_context->arena,
783 YR_RE_CODE_SECTION,
784 re_node->re_class,
785 sizeof(*re_node->re_class),
786 NULL));
787 break;
788
789 case RE_NODE_ANCHOR_START:
790 FAIL_ON_ERROR(_yr_emit_inst(
791 emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
792 break;
793
794 case RE_NODE_ANCHOR_END:
795 FAIL_ON_ERROR(
796 _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
797 break;
798
799 case RE_NODE_CONCAT:
800 FAIL_ON_ERROR(_yr_re_emit(
801 emit_context,
802 (flags & EMIT_BACKWARDS) ? re_node->children_tail
803 : re_node->children_head,
804 flags,
805 &instruction_ref));
806
807 if (flags & EMIT_BACKWARDS)
808 child = re_node->children_tail->prev_sibling;
809 else
810 child = re_node->children_head->next_sibling;
811
812 while (child != NULL)
813 {
814 FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
815
816 child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
817 : child->next_sibling;
818 }
819 break;
820
821 case RE_NODE_PLUS:
822 // Code for e+ looks like:
823 //
824 // L1: code for e
825 // split L1, L2
826 // L2:
827 //
828 FAIL_ON_ERROR(_yr_re_emit(
829 emit_context, re_node->children_head, flags, &instruction_ref));
830
831 jmp_offset = instruction_ref.offset - current_re_code_offset();
832
833 if (jmp_offset < INT16_MIN)
834 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
835
836 FAIL_ON_ERROR(_yr_emit_split(
837 emit_context,
838 re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
839 (int16_t) jmp_offset,
840 NULL,
841 NULL));
842
843 break;
844
845 case RE_NODE_STAR:
846 // Code for e* looks like:
847 //
848 // L1: split L1, L2
849 // code for e
850 // jmp L1
851 // L2:
852 FAIL_ON_ERROR(_yr_emit_split(
853 emit_context,
854 re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
855 0,
856 &instruction_ref,
857 &split_offset_ref));
858
859 FAIL_ON_ERROR(
860 _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
861
862 jmp_offset = instruction_ref.offset - current_re_code_offset();
863
864 if (jmp_offset < INT16_MIN)
865 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
866
867 // Emit jump with offset set to 0.
868
869 FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
870 emit_context, RE_OPCODE_JUMP, (int16_t) jmp_offset, NULL, NULL));
871
872 jmp_offset = current_re_code_offset() - instruction_ref.offset;
873
874 if (jmp_offset > INT16_MAX)
875 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
876
877 // Update split offset.
878 split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
879 emit_context->arena, &split_offset_ref);
880
881 *split_offset_addr = (int16_t) jmp_offset;
882 break;
883
884 case RE_NODE_ALT:
885 // Code for e1|e2 looks like:
886 //
887 // split L1, L2
888 // L1: code for e1
889 // jmp L3
890 // L2: code for e2
891 // L3:
892
893 // Emit a split instruction with offset set to 0 temporarily. Offset
894 // will be updated after we know the size of the code generated for
895 // the left node (e1).
896
897 FAIL_ON_ERROR(_yr_emit_split(
898 emit_context,
899 RE_OPCODE_SPLIT_A,
900 0,
901 &instruction_ref,
902 &split_offset_ref));
903
904 FAIL_ON_ERROR(
905 _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
906
907 // Emit jump with offset set to 0.
908
909 FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
910 emit_context,
911 RE_OPCODE_JUMP,
912 0,
913 &jmp_instruction_ref,
914 &jmp_offset_ref));
915
916 jmp_offset = current_re_code_offset() - instruction_ref.offset;
917
918 if (jmp_offset > INT16_MAX)
919 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
920
921 // Update split offset.
922 split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
923 emit_context->arena, &split_offset_ref);
924
925 *split_offset_addr = (int16_t) jmp_offset;
926
927 FAIL_ON_ERROR(
928 _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
929
930 jmp_offset = current_re_code_offset() - jmp_instruction_ref.offset;
931
932 if (jmp_offset > INT16_MAX)
933 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
934
935 // Update offset for jmp instruction.
936 jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
937 emit_context->arena, &jmp_offset_ref);
938
939 *jmp_offset_addr = (int16_t) jmp_offset;
940 break;
941
942 case RE_NODE_RANGE_ANY:
943 repeat_any_args.min = re_node->start;
944 repeat_any_args.max = re_node->end;
945
946 FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
947 emit_context,
948 re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
949 : RE_OPCODE_REPEAT_ANY_UNGREEDY,
950 &repeat_any_args,
951 sizeof(repeat_any_args),
952 &instruction_ref,
953 NULL));
954
955 break;
956
957 case RE_NODE_RANGE:
958 // Code for e{n,m} looks like:
959 //
960 // code for e --- prolog
961 // repeat_start n, m, L1 --+
962 // L0: code for e | repeat
963 // repeat_end n, m, L0 --+
964 // L1: split L2, L3 --- split
965 // L2: code for e --- epilog
966 // L3:
967 //
968 // Not all sections (prolog, repeat, split and epilog) are generated in all
969 // cases, it depends on the values of n and m. The following table shows
970 // which sections are generated for the first few values of n and m.
971 //
972 // n,m prolog repeat split epilog
973 // (min,max)
974 // ---------------------------------------
975 // 0,0 - - - -
976 // 0,1 - - X X
977 // 0,2 - 0,1 X X
978 // 0,3 - 0,2 X X
979 // 0,M - 0,M-1 X X
980 //
981 // 1,1 X - - -
982 // 1,2 X - X X
983 // 1,3 X 0,1 X X
984 // 1,4 X 1,2 X X
985 // 1,M X 1,M-2 X X
986 //
987 // 2,2 X - - X
988 // 2,3 X 1,1 X X
989 // 2,4 X 1,2 X X
990 // 2,M X 1,M-2 X X
991 //
992 // 3,3 X 1,1 - X
993 // 3,4 X 2,2 X X
994 // 3,M X 2,M-2 X X
995 //
996 // 4,4 X 2,2 - X
997 // 4,5 X 3,3 X X
998 // 4,M X 3,M-2 X X
999 //
1000 // The code can't consists simply in the repeat section, the prolog and
1001 // epilog are required because we can't have atoms pointing to code inside
1002 // the repeat loop. Atoms' forwards_code will point to code in the prolog
1003 // and backwards_code will point to code in the epilog (or in prolog if
1004 // epilog wasn't generated, like in n=1,m=1)
1005
1006 emit_prolog = re_node->start > 0;
1007 emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1008 emit_split = re_node->end > re_node->start;
1009 emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1010
1011 if (emit_prolog)
1012 {
1013 FAIL_ON_ERROR(_yr_re_emit(
1014 emit_context, re_node->children_head, flags, &instruction_ref));
1015 }
1016
1017 if (emit_repeat)
1018 {
1019 repeat_args.min = re_node->start;
1020 repeat_args.max = re_node->end;
1021
1022 if (emit_prolog)
1023 {
1024 repeat_args.max--;
1025 repeat_args.min--;
1026 }
1027
1028 if (emit_split)
1029 {
1030 repeat_args.max--;
1031 }
1032 else
1033 {
1034 repeat_args.min--;
1035 repeat_args.max--;
1036 }
1037
1038 repeat_args.offset = 0;
1039
1040 bookmark_1 = current_re_code_offset();
1041
1042 FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1043 emit_context,
1044 re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1045 : RE_OPCODE_REPEAT_START_UNGREEDY,
1046 &repeat_args,
1047 sizeof(repeat_args),
1048 emit_prolog ? NULL : &instruction_ref,
1049 &repeat_start_args_ref));
1050
1051 bookmark_2 = current_re_code_offset();
1052
1053 FAIL_ON_ERROR(_yr_re_emit(
1054 emit_context,
1055 re_node->children_head,
1056 flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1057 NULL));
1058
1059 bookmark_3 = current_re_code_offset();
1060
1061 if (bookmark_2 - bookmark_3 < INT32_MIN)
1062 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1063
1064 repeat_args.offset = (int32_t)(bookmark_2 - bookmark_3);
1065
1066 FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1067 emit_context,
1068 re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1069 : RE_OPCODE_REPEAT_END_UNGREEDY,
1070 &repeat_args,
1071 sizeof(repeat_args),
1072 NULL,
1073 NULL));
1074
1075 bookmark_4 = current_re_code_offset();
1076
1077 repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1078 emit_context->arena, &repeat_start_args_ref);
1079
1080 if (bookmark_4 - bookmark_1 > INT32_MAX)
1081 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1082
1083 repeat_start_args_addr->offset = (int32_t)(bookmark_4 - bookmark_1);
1084 }
1085
1086 if (emit_split)
1087 {
1088 bookmark_1 = current_re_code_offset();
1089
1090 FAIL_ON_ERROR(_yr_emit_split(
1091 emit_context,
1092 re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1093 0,
1094 NULL,
1095 &split_offset_ref));
1096 }
1097
1098 if (emit_epilog)
1099 {
1100 FAIL_ON_ERROR(_yr_re_emit(
1101 emit_context,
1102 re_node->children_head,
1103 emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1104 emit_prolog || emit_repeat ? NULL : &instruction_ref));
1105 }
1106
1107 if (emit_split)
1108 {
1109 bookmark_2 = current_re_code_offset();
1110
1111 if (bookmark_2 - bookmark_1 > INT16_MAX)
1112 return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1113
1114 split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1115 emit_context->arena, &split_offset_ref);
1116
1117 *split_offset_addr = (int16_t)(bookmark_2 - bookmark_1);
1118 }
1119
1120 break;
1121 }
1122
1123 if (flags & EMIT_BACKWARDS)
1124 {
1125 if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1126 {
1127 re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1128 re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1129 emit_context->arena, YR_RE_CODE_SECTION);
1130 }
1131 }
1132 else
1133 {
1134 if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1135 {
1136 re_node->forward_code_ref = instruction_ref;
1137 }
1138 }
1139
1140 if (code_ref != NULL)
1141 *code_ref = instruction_ref;
1142
1143 return ERROR_SUCCESS;
1144 }
1145
yr_re_ast_emit_code(RE_AST * re_ast,YR_ARENA * arena,int backwards_code)1146 int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1147 {
1148 RE_EMIT_CONTEXT emit_context;
1149
1150 // Emit code for matching the regular expressions forwards.
1151 emit_context.arena = arena;
1152 emit_context.next_split_id = 0;
1153
1154 FAIL_ON_ERROR(_yr_re_emit(
1155 &emit_context,
1156 re_ast->root_node,
1157 backwards_code ? EMIT_BACKWARDS : 0,
1158 NULL));
1159
1160 FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1161
1162 return ERROR_SUCCESS;
1163 }
1164
_yr_re_fiber_create(RE_FIBER_POOL * fiber_pool,RE_FIBER ** new_fiber)1165 static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1166 {
1167 RE_FIBER* fiber;
1168
1169 if (fiber_pool->fibers.head != NULL)
1170 {
1171 fiber = fiber_pool->fibers.head;
1172 fiber_pool->fibers.head = fiber->next;
1173
1174 if (fiber_pool->fibers.tail == fiber)
1175 fiber_pool->fibers.tail = NULL;
1176 }
1177 else
1178 {
1179 if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1180 return ERROR_TOO_MANY_RE_FIBERS;
1181
1182 fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1183
1184 if (fiber == NULL)
1185 return ERROR_INSUFFICIENT_MEMORY;
1186
1187 fiber_pool->fiber_count++;
1188 }
1189
1190 fiber->ip = NULL;
1191 fiber->sp = -1;
1192 fiber->rc = -1;
1193 fiber->next = NULL;
1194 fiber->prev = NULL;
1195
1196 *new_fiber = fiber;
1197
1198 return ERROR_SUCCESS;
1199 }
1200
1201 ////////////////////////////////////////////////////////////////////////////////
1202 // Appends 'fiber' to 'fiber_list'
1203 //
_yr_re_fiber_append(RE_FIBER_LIST * fiber_list,RE_FIBER * fiber)1204 static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1205 {
1206 assert(fiber->prev == NULL);
1207 assert(fiber->next == NULL);
1208
1209 fiber->prev = fiber_list->tail;
1210
1211 if (fiber_list->tail != NULL)
1212 fiber_list->tail->next = fiber;
1213
1214 fiber_list->tail = fiber;
1215
1216 if (fiber_list->head == NULL)
1217 fiber_list->head = fiber;
1218
1219 assert(fiber_list->tail->next == NULL);
1220 assert(fiber_list->head->prev == NULL);
1221 }
1222
1223 ////////////////////////////////////////////////////////////////////////////////
1224 // Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1225 // than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1226 // the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1227 // taken into account.
1228 //
_yr_re_fiber_exists(RE_FIBER_LIST * fiber_list,RE_FIBER * target_fiber,RE_FIBER * last_fiber)1229 static int _yr_re_fiber_exists(
1230 RE_FIBER_LIST* fiber_list,
1231 RE_FIBER* target_fiber,
1232 RE_FIBER* last_fiber)
1233 {
1234 RE_FIBER* fiber = fiber_list->head;
1235
1236 int equal_stacks;
1237 int i;
1238
1239 if (last_fiber == NULL)
1240 return false;
1241
1242 while (fiber != last_fiber->next)
1243 {
1244 if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1245 fiber->rc == target_fiber->rc)
1246 {
1247 equal_stacks = true;
1248
1249 for (i = 0; i <= fiber->sp; i++)
1250 {
1251 if (fiber->stack[i] != target_fiber->stack[i])
1252 {
1253 equal_stacks = false;
1254 break;
1255 }
1256 }
1257
1258 if (equal_stacks)
1259 return true;
1260 }
1261
1262 fiber = fiber->next;
1263 }
1264
1265 return false;
1266 }
1267
1268 ////////////////////////////////////////////////////////////////////////////////
1269 // Clones a fiber in fiber_list and inserts the cloned fiber just after.
1270 // the original one. If fiber_list is:
1271 //
1272 // f1 -> f2 -> f3 -> f4
1273 //
1274 // Splitting f2 will result in:
1275 //
1276 // f1 -> f2 -> cloned f2 -> f3 -> f4
1277 //
_yr_re_fiber_split(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber,RE_FIBER ** new_fiber)1278 static int _yr_re_fiber_split(
1279 RE_FIBER_LIST* fiber_list,
1280 RE_FIBER_POOL* fiber_pool,
1281 RE_FIBER* fiber,
1282 RE_FIBER** new_fiber)
1283 {
1284 int32_t i;
1285
1286 FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1287
1288 (*new_fiber)->sp = fiber->sp;
1289 (*new_fiber)->ip = fiber->ip;
1290 (*new_fiber)->rc = fiber->rc;
1291
1292 for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1293
1294 (*new_fiber)->next = fiber->next;
1295 (*new_fiber)->prev = fiber;
1296
1297 if (fiber->next != NULL)
1298 fiber->next->prev = *new_fiber;
1299
1300 fiber->next = *new_fiber;
1301
1302 if (fiber_list->tail == fiber)
1303 fiber_list->tail = *new_fiber;
1304
1305 assert(fiber_list->tail->next == NULL);
1306 assert(fiber_list->head->prev == NULL);
1307
1308 return ERROR_SUCCESS;
1309 }
1310
1311 ////////////////////////////////////////////////////////////////////////////////
1312 // Kills a given fiber by removing it from the fiber list and putting it in the
1313 // fiber pool.
1314 //
_yr_re_fiber_kill(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber)1315 static RE_FIBER* _yr_re_fiber_kill(
1316 RE_FIBER_LIST* fiber_list,
1317 RE_FIBER_POOL* fiber_pool,
1318 RE_FIBER* fiber)
1319 {
1320 RE_FIBER* next_fiber = fiber->next;
1321
1322 if (fiber->prev != NULL)
1323 fiber->prev->next = next_fiber;
1324
1325 if (next_fiber != NULL)
1326 next_fiber->prev = fiber->prev;
1327
1328 if (fiber_pool->fibers.tail != NULL)
1329 fiber_pool->fibers.tail->next = fiber;
1330
1331 if (fiber_list->tail == fiber)
1332 fiber_list->tail = fiber->prev;
1333
1334 if (fiber_list->head == fiber)
1335 fiber_list->head = next_fiber;
1336
1337 fiber->next = NULL;
1338 fiber->prev = fiber_pool->fibers.tail;
1339 fiber_pool->fibers.tail = fiber;
1340
1341 if (fiber_pool->fibers.head == NULL)
1342 fiber_pool->fibers.head = fiber;
1343
1344 return next_fiber;
1345 }
1346
1347 ////////////////////////////////////////////////////////////////////////////////
1348 // Kills all fibers from the given one up to the end of the fiber list.
1349 //
_yr_re_fiber_kill_tail(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber)1350 static void _yr_re_fiber_kill_tail(
1351 RE_FIBER_LIST* fiber_list,
1352 RE_FIBER_POOL* fiber_pool,
1353 RE_FIBER* fiber)
1354 {
1355 RE_FIBER* prev_fiber = fiber->prev;
1356
1357 if (prev_fiber != NULL)
1358 prev_fiber->next = NULL;
1359
1360 fiber->prev = fiber_pool->fibers.tail;
1361
1362 if (fiber_pool->fibers.tail != NULL)
1363 fiber_pool->fibers.tail->next = fiber;
1364
1365 fiber_pool->fibers.tail = fiber_list->tail;
1366 fiber_list->tail = prev_fiber;
1367
1368 if (fiber_list->head == fiber)
1369 fiber_list->head = NULL;
1370
1371 if (fiber_pool->fibers.head == NULL)
1372 fiber_pool->fibers.head = fiber;
1373 }
1374
1375 ////////////////////////////////////////////////////////////////////////////////
1376 // Kills all fibers in the fiber list.
1377 //
_yr_re_fiber_kill_all(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool)1378 static void _yr_re_fiber_kill_all(
1379 RE_FIBER_LIST* fiber_list,
1380 RE_FIBER_POOL* fiber_pool)
1381 {
1382 if (fiber_list->head != NULL)
1383 _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1384 }
1385
1386 ////////////////////////////////////////////////////////////////////////////////
1387 // Executes a fiber until reaching an "matching" instruction. A "matching"
1388 // instruction is one that actually reads a byte from the input and performs
1389 // some matching. If the fiber reaches a split instruction, the new fiber is
1390 // also synced.
1391 //
_yr_re_fiber_sync(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber_to_sync)1392 static int _yr_re_fiber_sync(
1393 RE_FIBER_LIST* fiber_list,
1394 RE_FIBER_POOL* fiber_pool,
1395 RE_FIBER* fiber_to_sync)
1396 {
1397 // A array for keeping track of which split instructions has been already
1398 // executed. Each split instruction within a regexp has an associated ID
1399 // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1400 // required to avoid infinite loops in regexps like (a*)* or (a|)*
1401
1402 RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1403 RE_SPLIT_ID_TYPE splits_executed_count = 0;
1404 RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1405
1406 int split_already_executed;
1407
1408 RE_REPEAT_ARGS* repeat_args;
1409 RE_REPEAT_ANY_ARGS* repeat_any_args;
1410
1411 RE_FIBER* fiber;
1412 RE_FIBER* last;
1413 RE_FIBER* next;
1414 RE_FIBER* branch_a;
1415 RE_FIBER* branch_b;
1416
1417 fiber = fiber_to_sync;
1418 last = fiber_to_sync->next;
1419
1420 while (fiber != last)
1421 {
1422 uint8_t opcode = *fiber->ip;
1423
1424 switch (opcode)
1425 {
1426 case RE_OPCODE_SPLIT_A:
1427 case RE_OPCODE_SPLIT_B:
1428
1429 split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1430 split_already_executed = false;
1431
1432 for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1433 splits_executed_idx++)
1434 {
1435 if (split_id == splits_executed[splits_executed_idx])
1436 {
1437 split_already_executed = true;
1438 break;
1439 }
1440 }
1441
1442 if (split_already_executed)
1443 {
1444 fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1445 }
1446 else
1447 {
1448 branch_a = fiber;
1449
1450 FAIL_ON_ERROR(
1451 _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1452
1453 // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1454 // instruction in the stream (branch A), while the newly created
1455 // fiber starts at the address indicated by the instruction (branch B)
1456 // RE_OPCODE_SPLIT_B has the opposite behavior.
1457
1458 if (opcode == RE_OPCODE_SPLIT_B)
1459 yr_swap(branch_a, branch_b, RE_FIBER*);
1460
1461 // Branch A continues at the next instruction
1462 branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1463
1464 // Branch B adds the offset encoded in the opcode to its instruction
1465 // pointer.
1466 branch_b->ip += *(int16_t*)(
1467 branch_b->ip
1468 + 1 // opcode size
1469 + sizeof(RE_SPLIT_ID_TYPE));
1470
1471 #ifdef YR_PARANOID_MODE
1472 // In normal conditions this should never happen. But with compiled
1473 // rules that has been hand-crafted by a malicious actor this could
1474 // happen.
1475 if (splits_executed_count >= RE_MAX_SPLIT_ID)
1476 return ERROR_INTERNAL_FATAL_ERROR;
1477 #endif
1478
1479 splits_executed[splits_executed_count] = split_id;
1480 splits_executed_count++;
1481 }
1482
1483 break;
1484
1485 case RE_OPCODE_REPEAT_START_GREEDY:
1486 case RE_OPCODE_REPEAT_START_UNGREEDY:
1487
1488 repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1489 assert(repeat_args->max > 0);
1490 branch_a = fiber;
1491
1492 if (repeat_args->min == 0)
1493 {
1494 FAIL_ON_ERROR(
1495 _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1496
1497 if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1498 yr_swap(branch_a, branch_b, RE_FIBER*);
1499
1500 branch_b->ip += repeat_args->offset;
1501 }
1502
1503 branch_a->stack[++branch_a->sp] = 0;
1504 branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1505 break;
1506
1507 case RE_OPCODE_REPEAT_END_GREEDY:
1508 case RE_OPCODE_REPEAT_END_UNGREEDY:
1509
1510 repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1511 fiber->stack[fiber->sp]++;
1512
1513 if (fiber->stack[fiber->sp] < repeat_args->min)
1514 {
1515 fiber->ip += repeat_args->offset;
1516 break;
1517 }
1518
1519 branch_a = fiber;
1520
1521 if (fiber->stack[fiber->sp] < repeat_args->max)
1522 {
1523 FAIL_ON_ERROR(
1524 _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1525
1526 if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1527 yr_swap(branch_a, branch_b, RE_FIBER*);
1528
1529 branch_b->ip += repeat_args->offset;
1530 }
1531
1532 branch_a->sp--;
1533 branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1534 break;
1535
1536 case RE_OPCODE_REPEAT_ANY_GREEDY:
1537 case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1538
1539 repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1540
1541 // If repetition counter (rc) is -1 it means that we are reaching this
1542 // instruction from the previous one in the instructions stream. In
1543 // this case let's initialize the counter to 0 and start looping.
1544
1545 if (fiber->rc == -1)
1546 fiber->rc = 0;
1547
1548 if (fiber->rc < repeat_any_args->min)
1549 {
1550 // Increase repetition counter and continue with next fiber. The
1551 // instruction pointer for this fiber is not incremented yet, this
1552 // fiber spins in this same instruction until reaching the minimum
1553 // number of repetitions.
1554
1555 fiber->rc++;
1556 fiber = fiber->next;
1557 }
1558 else if (fiber->rc < repeat_any_args->max)
1559 {
1560 // Once the minimum number of repetitions are matched one fiber
1561 // remains spinning in this instruction until reaching the maximum
1562 // number of repetitions while new fibers are created. New fibers
1563 // start executing at the next instruction.
1564
1565 next = fiber->next;
1566 branch_a = fiber;
1567
1568 FAIL_ON_ERROR(
1569 _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1570
1571 if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1572 yr_swap(branch_a, branch_b, RE_FIBER*);
1573
1574 branch_a->rc++;
1575 branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1576 branch_b->rc = -1;
1577
1578 FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1579
1580 fiber = next;
1581 }
1582 else
1583 {
1584 // When the maximum number of repetitions is reached the fiber keeps
1585 // executing at the next instruction. The repetition counter is set
1586 // to -1 indicating that we are not spinning in a repeat instruction
1587 // anymore.
1588
1589 fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1590 fiber->rc = -1;
1591 }
1592
1593 break;
1594
1595 case RE_OPCODE_JUMP:
1596 fiber->ip += *(int16_t*) (fiber->ip + 1);
1597 break;
1598
1599 default:
1600 fiber = fiber->next;
1601 }
1602 }
1603
1604 return ERROR_SUCCESS;
1605 }
1606
1607 ////////////////////////////////////////////////////////////////////////////////
1608 // Executes a regular expression. The specified regular expression will try to
1609 // match the data starting at the address specified by "input". The "input"
1610 // pointer can point to any address inside a memory buffer. Arguments
1611 // "input_forwards_size" and "input_backwards_size" indicate how many bytes
1612 // can be accessible starting at "input" and going forwards and backwards
1613 // respectively.
1614 //
1615 // <--- input_backwards_size -->|<----------- input_forwards_size -------->
1616 // |-------- memory buffer -----------------------------------------------|
1617 // ^
1618 // input
1619 //
1620 // Args:
1621 // YR_SCAN_CONTEXT *context - Scan context.
1622 // const uint8_t* code - Regexp code be executed
1623 // const uint8_t* input - Pointer to input data
1624 // size_t input_forwards_size - Number of accessible bytes starting at
1625 // "input" and going forwards.
1626 // size_t input_backwards_size - Number of accessible bytes starting at
1627 // "input" and going backwards
1628 // int flags - Flags:
1629 // RE_FLAGS_SCAN
1630 // RE_FLAGS_BACKWARDS
1631 // RE_FLAGS_EXHAUSTIVE
1632 // RE_FLAGS_WIDE
1633 // RE_FLAGS_NO_CASE
1634 // RE_FLAGS_DOT_ALL
1635 // RE_MATCH_CALLBACK_FUNC callback - Callback function
1636 // void* callback_args - Callback argument
1637 // int* matches - Pointer to an integer receiving the
1638 // number of matching bytes. Notice that
1639 // 0 means a zero-length match, while no
1640 // matches is -1.
1641 // Returns:
1642 // ERROR_SUCCESS or any other error code.
1643
yr_re_exec(YR_SCAN_CONTEXT * context,const uint8_t * code,const uint8_t * input_data,size_t input_forwards_size,size_t input_backwards_size,int flags,RE_MATCH_CALLBACK_FUNC callback,void * callback_args,int * matches)1644 int yr_re_exec(
1645 YR_SCAN_CONTEXT* context,
1646 const uint8_t* code,
1647 const uint8_t* input_data,
1648 size_t input_forwards_size,
1649 size_t input_backwards_size,
1650 int flags,
1651 RE_MATCH_CALLBACK_FUNC callback,
1652 void* callback_args,
1653 int* matches)
1654 {
1655 const uint8_t* input;
1656 const uint8_t* ip;
1657
1658 uint8_t mask;
1659 uint8_t value;
1660 uint8_t character_size;
1661
1662 RE_FIBER_LIST fibers;
1663 RE_FIBER* fiber;
1664 RE_FIBER* next_fiber;
1665
1666 int bytes_matched;
1667 int max_bytes_matched;
1668
1669 int match;
1670 int input_incr;
1671 int kill;
1672 int action;
1673
1674 #define ACTION_NONE 0
1675 #define ACTION_CONTINUE 1
1676 #define ACTION_KILL 2
1677 #define ACTION_KILL_TAIL 3
1678
1679 #define prolog \
1680 { \
1681 if ((bytes_matched >= max_bytes_matched) || \
1682 (character_size == 2 && *(input + 1) != 0)) \
1683 { \
1684 action = ACTION_KILL; \
1685 break; \
1686 } \
1687 }
1688
1689 if (matches != NULL)
1690 *matches = -1;
1691
1692 if (flags & RE_FLAGS_WIDE)
1693 character_size = 2;
1694 else
1695 character_size = 1;
1696
1697 input = input_data;
1698 input_incr = character_size;
1699
1700 if (flags & RE_FLAGS_BACKWARDS)
1701 {
1702 max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1703 input -= character_size;
1704 input_incr = -input_incr;
1705 }
1706 else
1707 {
1708 max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1709 }
1710
1711 // Round down max_bytes_matched to a multiple of character_size, this way if
1712 // character_size is 2 and max_bytes_matched is odd we are ignoring the
1713 // extra byte which can't match anyways.
1714
1715 max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1716 bytes_matched = 0;
1717
1718 FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1719
1720 fiber->ip = code;
1721 fibers.head = fiber;
1722 fibers.tail = fiber;
1723
1724 FAIL_ON_ERROR_WITH_CLEANUP(
1725 _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1726 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1727
1728 while (fibers.head != NULL)
1729 {
1730 fiber = fibers.head;
1731
1732 while (fiber != NULL)
1733 {
1734 next_fiber = fiber->next;
1735
1736 if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1737 _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1738
1739 fiber = next_fiber;
1740 }
1741
1742 fiber = fibers.head;
1743
1744 while (fiber != NULL)
1745 {
1746 ip = fiber->ip;
1747 action = ACTION_NONE;
1748
1749 switch (*ip)
1750 {
1751 case RE_OPCODE_ANY:
1752 prolog;
1753 match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1754 action = match ? ACTION_NONE : ACTION_KILL;
1755 fiber->ip += 1;
1756 break;
1757
1758 case RE_OPCODE_REPEAT_ANY_GREEDY:
1759 case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1760 prolog;
1761 match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1762 action = match ? ACTION_NONE : ACTION_KILL;
1763
1764 // The instruction pointer is not incremented here. The current fiber
1765 // spins in this instruction until reaching the required number of
1766 // repetitions. The code controlling the number of repetitions is in
1767 // _yr_re_fiber_sync.
1768
1769 break;
1770
1771 case RE_OPCODE_LITERAL:
1772 prolog;
1773 if (flags & RE_FLAGS_NO_CASE)
1774 match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1775 else
1776 match = (*input == *(ip + 1));
1777 action = match ? ACTION_NONE : ACTION_KILL;
1778 fiber->ip += 2;
1779 break;
1780
1781 case RE_OPCODE_MASKED_LITERAL:
1782 prolog;
1783 value = *(int16_t*) (ip + 1) & 0xFF;
1784 mask = *(int16_t*) (ip + 1) >> 8;
1785
1786 // We don't need to take into account the case-insensitive
1787 // case because this opcode is only used with hex strings,
1788 // which can't be case-insensitive.
1789
1790 match = ((*input & mask) == value);
1791 action = match ? ACTION_NONE : ACTION_KILL;
1792 fiber->ip += 3;
1793 break;
1794
1795 case RE_OPCODE_CLASS:
1796 prolog;
1797 match = _yr_re_is_char_in_class(
1798 (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1799 action = match ? ACTION_NONE : ACTION_KILL;
1800 fiber->ip += (sizeof(RE_CLASS) + 1);
1801 break;
1802
1803 case RE_OPCODE_WORD_CHAR:
1804 prolog;
1805 match = _yr_re_is_word_char(input, character_size);
1806 action = match ? ACTION_NONE : ACTION_KILL;
1807 fiber->ip += 1;
1808 break;
1809
1810 case RE_OPCODE_NON_WORD_CHAR:
1811 prolog;
1812 match = !_yr_re_is_word_char(input, character_size);
1813 action = match ? ACTION_NONE : ACTION_KILL;
1814 fiber->ip += 1;
1815 break;
1816
1817 case RE_OPCODE_SPACE:
1818 case RE_OPCODE_NON_SPACE:
1819
1820 prolog;
1821
1822 switch (*input)
1823 {
1824 case ' ':
1825 case '\t':
1826 case '\r':
1827 case '\n':
1828 case '\v':
1829 case '\f':
1830 match = true;
1831 break;
1832 default:
1833 match = false;
1834 }
1835
1836 if (*ip == RE_OPCODE_NON_SPACE)
1837 match = !match;
1838
1839 action = match ? ACTION_NONE : ACTION_KILL;
1840 fiber->ip += 1;
1841 break;
1842
1843 case RE_OPCODE_DIGIT:
1844 prolog;
1845 match = isdigit(*input);
1846 action = match ? ACTION_NONE : ACTION_KILL;
1847 fiber->ip += 1;
1848 break;
1849
1850 case RE_OPCODE_NON_DIGIT:
1851 prolog;
1852 match = !isdigit(*input);
1853 action = match ? ACTION_NONE : ACTION_KILL;
1854 fiber->ip += 1;
1855 break;
1856
1857 case RE_OPCODE_WORD_BOUNDARY:
1858 case RE_OPCODE_NON_WORD_BOUNDARY:
1859
1860 if (bytes_matched == 0 && input_backwards_size < character_size)
1861 {
1862 match = true;
1863 }
1864 else if (bytes_matched >= max_bytes_matched)
1865 {
1866 match = true;
1867 }
1868 else
1869 {
1870 assert(input < input_data + input_forwards_size);
1871 assert(input >= input_data - input_backwards_size);
1872
1873 assert(input - input_incr < input_data + input_forwards_size);
1874 assert(input - input_incr >= input_data - input_backwards_size);
1875
1876 match = _yr_re_is_word_char(input, character_size) !=
1877 _yr_re_is_word_char(input - input_incr, character_size);
1878 }
1879
1880 if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1881 match = !match;
1882
1883 action = match ? ACTION_CONTINUE : ACTION_KILL;
1884 fiber->ip += 1;
1885 break;
1886
1887 case RE_OPCODE_MATCH_AT_START:
1888 if (flags & RE_FLAGS_BACKWARDS)
1889 kill = input_backwards_size > (size_t) bytes_matched;
1890 else
1891 kill = input_backwards_size > 0 || (bytes_matched != 0);
1892 action = kill ? ACTION_KILL : ACTION_CONTINUE;
1893 fiber->ip += 1;
1894 break;
1895
1896 case RE_OPCODE_MATCH_AT_END:
1897 kill = flags & RE_FLAGS_BACKWARDS ||
1898 input_forwards_size > (size_t) bytes_matched;
1899 action = kill ? ACTION_KILL : ACTION_CONTINUE;
1900 fiber->ip += 1;
1901 break;
1902
1903 case RE_OPCODE_MATCH:
1904
1905 if (matches != NULL)
1906 *matches = bytes_matched;
1907
1908 if (flags & RE_FLAGS_EXHAUSTIVE)
1909 {
1910 if (callback != NULL)
1911 {
1912 if (flags & RE_FLAGS_BACKWARDS)
1913 {
1914 FAIL_ON_ERROR_WITH_CLEANUP(
1915 callback(
1916 input + character_size,
1917 bytes_matched,
1918 flags,
1919 callback_args),
1920 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1921 }
1922 else
1923 {
1924 FAIL_ON_ERROR_WITH_CLEANUP(
1925 callback(input_data, bytes_matched, flags, callback_args),
1926 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1927 }
1928 }
1929
1930 action = ACTION_KILL;
1931 }
1932 else
1933 {
1934 action = ACTION_KILL_TAIL;
1935 }
1936
1937 break;
1938
1939 default:
1940 assert(false);
1941 }
1942
1943 switch (action)
1944 {
1945 case ACTION_KILL:
1946 fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1947 break;
1948
1949 case ACTION_KILL_TAIL:
1950 _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
1951 fiber = NULL;
1952 break;
1953
1954 case ACTION_CONTINUE:
1955 FAIL_ON_ERROR_WITH_CLEANUP(
1956 _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1957 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1958 break;
1959
1960 default:
1961 next_fiber = fiber->next;
1962 FAIL_ON_ERROR_WITH_CLEANUP(
1963 _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1964 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1965 fiber = next_fiber;
1966 }
1967 }
1968
1969 input += input_incr;
1970 bytes_matched += character_size;
1971
1972 if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
1973 {
1974 FAIL_ON_ERROR_WITH_CLEANUP(
1975 _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
1976 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1977
1978 fiber->ip = code;
1979 _yr_re_fiber_append(&fibers, fiber);
1980
1981 FAIL_ON_ERROR_WITH_CLEANUP(
1982 _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1983 _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1984 }
1985 }
1986
1987 return ERROR_SUCCESS;
1988 }
1989
1990 ////////////////////////////////////////////////////////////////////////////////
1991 // This function replaces yr_re_exec for regular expressions marked with flag
1992 // RE_FLAGS_FAST_REGEXP. These are regular expression whose code contain only
1993 // the following operations: RE_OPCODE_LITERAL, RE_OPCODE_MASKED_LITERAL,
1994 // RE_OPCODE_ANY, RE_OPCODE_REPEAT_ANY_UNGREEDY and RE_OPCODE_MATCH. Some
1995 // examples of regular expressions that can be executed with this function are:
1996 //
1997 // /foobar/
1998 // /foo.*?bar/
1999 //
yr_re_fast_exec(YR_SCAN_CONTEXT * context,const uint8_t * code,const uint8_t * input_data,size_t input_forwards_size,size_t input_backwards_size,int flags,RE_MATCH_CALLBACK_FUNC callback,void * callback_args,int * matches)2000 int yr_re_fast_exec(
2001 YR_SCAN_CONTEXT* context,
2002 const uint8_t* code,
2003 const uint8_t* input_data,
2004 size_t input_forwards_size,
2005 size_t input_backwards_size,
2006 int flags,
2007 RE_MATCH_CALLBACK_FUNC callback,
2008 void* callback_args,
2009 int* matches)
2010 {
2011 RE_REPEAT_ANY_ARGS* repeat_any_args;
2012
2013 const uint8_t* code_stack[YR_MAX_FAST_RE_STACK];
2014 const uint8_t* input_stack[YR_MAX_FAST_RE_STACK];
2015 int matches_stack[YR_MAX_FAST_RE_STACK];
2016
2017 const uint8_t* input = input_data;
2018 const uint8_t* next_input;
2019 const uint8_t* ip = code;
2020 const uint8_t* next_opcode;
2021
2022 uint8_t mask;
2023 uint8_t value;
2024
2025 int stop;
2026 int input_incr;
2027 int sp = 0;
2028
2029 int bytes_matched;
2030 int max_bytes_matched;
2031
2032 max_bytes_matched = flags & RE_FLAGS_BACKWARDS ? (int) input_backwards_size
2033 : (int) input_forwards_size;
2034
2035 input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2036
2037 if (flags & RE_FLAGS_BACKWARDS)
2038 input--;
2039
2040 code_stack[sp] = code;
2041 input_stack[sp] = input;
2042 matches_stack[sp] = 0;
2043 sp++;
2044
2045 while (sp > 0)
2046 {
2047 sp--;
2048 ip = code_stack[sp];
2049 input = input_stack[sp];
2050 bytes_matched = matches_stack[sp];
2051 stop = false;
2052
2053 while (!stop)
2054 {
2055 if (*ip == RE_OPCODE_MATCH)
2056 {
2057 if (flags & RE_FLAGS_EXHAUSTIVE)
2058 {
2059 FAIL_ON_ERROR(callback(
2060 // When matching forwards the matching data always starts at
2061 // input_data, when matching backwards it starts at input + 1 or
2062 // input_data - input_backwards_size if input + 1 is outside the
2063 // buffer.
2064 flags & RE_FLAGS_BACKWARDS
2065 ? yr_max(input + 1, input_data - input_backwards_size)
2066 : input_data,
2067 // The number of matched bytes should not be larger than
2068 // max_bytes_matched.
2069 yr_min(bytes_matched, max_bytes_matched),
2070 flags,
2071 callback_args));
2072
2073 break;
2074 }
2075 else
2076 {
2077 if (matches != NULL)
2078 *matches = bytes_matched;
2079
2080 return ERROR_SUCCESS;
2081 }
2082 }
2083
2084 if (bytes_matched >= max_bytes_matched)
2085 break;
2086
2087 switch (*ip)
2088 {
2089 case RE_OPCODE_LITERAL:
2090
2091 if (*input == *(ip + 1))
2092 {
2093 bytes_matched++;
2094 input += input_incr;
2095 ip += 2;
2096 }
2097 else
2098 {
2099 stop = true;
2100 }
2101
2102 break;
2103
2104 case RE_OPCODE_MASKED_LITERAL:
2105
2106 value = *(int16_t*) (ip + 1) & 0xFF;
2107 mask = *(int16_t*) (ip + 1) >> 8;
2108
2109 if ((*input & mask) == value)
2110 {
2111 bytes_matched++;
2112 input += input_incr;
2113 ip += 3;
2114 }
2115 else
2116 {
2117 stop = true;
2118 }
2119
2120 break;
2121
2122 case RE_OPCODE_ANY:
2123
2124 bytes_matched++;
2125 input += input_incr;
2126 ip += 1;
2127
2128 break;
2129
2130 case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2131
2132 repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2133 next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2134
2135 for (int i = repeat_any_args->min + 1; i <= repeat_any_args->max; i++)
2136 {
2137 if (bytes_matched + i >= max_bytes_matched)
2138 break;
2139
2140 next_input = input + i * input_incr;
2141
2142 if (*(next_opcode) != RE_OPCODE_LITERAL ||
2143 (*(next_opcode) == RE_OPCODE_LITERAL &&
2144 *(next_opcode + 1) == *next_input))
2145 {
2146 if (sp >= YR_MAX_FAST_RE_STACK)
2147 return ERROR_TOO_MANY_RE_FIBERS;
2148
2149 code_stack[sp] = next_opcode;
2150 input_stack[sp] = next_input;
2151 matches_stack[sp] = bytes_matched + i;
2152 sp++;
2153 }
2154 }
2155
2156 input += input_incr * repeat_any_args->min;
2157 bytes_matched += repeat_any_args->min;
2158
2159 ip = next_opcode;
2160
2161 break;
2162
2163 default:
2164 assert(false);
2165 }
2166 }
2167 }
2168
2169 if (matches != NULL)
2170 *matches = -1;
2171
2172 return ERROR_SUCCESS;
2173 }
2174
_yr_re_print_node(RE_NODE * re_node,uint32_t indent)2175 static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2176 {
2177 RE_NODE* child;
2178 int i;
2179
2180 if (re_node == NULL)
2181 return;
2182
2183 if (indent > 0)
2184 printf("\n%*s", indent, " ");
2185 switch (re_node->type)
2186 {
2187 case RE_NODE_ALT:
2188 printf("Alt(");
2189 _yr_re_print_node(re_node->children_head, indent + 4);
2190 printf(",");
2191 _yr_re_print_node(re_node->children_tail, indent + 4);
2192 printf("\n%*s%s", indent, " ", ")");
2193 break;
2194
2195 case RE_NODE_CONCAT:
2196 printf("Cat(");
2197 child = re_node->children_head;
2198 while (child != NULL)
2199 {
2200 _yr_re_print_node(child, indent + 4);
2201 printf(",");
2202 child = child->next_sibling;
2203 }
2204 printf("\n%*s%s", indent, " ", ")");
2205 break;
2206
2207 case RE_NODE_STAR:
2208 printf("Star(");
2209 _yr_re_print_node(re_node->children_head, indent + 4);
2210 printf(")");
2211 break;
2212
2213 case RE_NODE_PLUS:
2214 printf("Plus(");
2215 _yr_re_print_node(re_node->children_head, indent + 4);
2216 printf(")");
2217 break;
2218
2219 case RE_NODE_LITERAL:
2220 printf("Lit(%c)", re_node->value);
2221 break;
2222
2223 case RE_NODE_MASKED_LITERAL:
2224 printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2225 break;
2226
2227 case RE_NODE_WORD_CHAR:
2228 printf("WordChar");
2229 break;
2230
2231 case RE_NODE_NON_WORD_CHAR:
2232 printf("NonWordChar");
2233 break;
2234
2235 case RE_NODE_SPACE:
2236 printf("Space");
2237 break;
2238
2239 case RE_NODE_NON_SPACE:
2240 printf("NonSpace");
2241 break;
2242
2243 case RE_NODE_DIGIT:
2244 printf("Digit");
2245 break;
2246
2247 case RE_NODE_NON_DIGIT:
2248 printf("NonDigit");
2249 break;
2250
2251 case RE_NODE_ANY:
2252 printf("Any");
2253 break;
2254
2255 case RE_NODE_RANGE:
2256 printf("Range(%d-%d, ", re_node->start, re_node->end);
2257 _yr_re_print_node(re_node->children_head, indent + 4);
2258 printf("\n%*s%s", indent, " ", ")");
2259 break;
2260
2261 case RE_NODE_CLASS:
2262 printf("Class(");
2263 for (i = 0; i < 256; i++)
2264 if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2265 printf("%02X,", i);
2266 printf(")");
2267 break;
2268
2269 default:
2270 printf("???");
2271 break;
2272 }
2273 }
2274
yr_re_print(RE_AST * re_ast)2275 void yr_re_print(RE_AST* re_ast)
2276 {
2277 _yr_re_print_node(re_ast->root_node, 0);
2278 }
2279