1 /*
2 Copyright (c) 2013. The YARA Authors. All Rights Reserved.
3 
4 Redistribution and use in source and binary forms, with or without modification,
5 are permitted provided that the following conditions are met:
6 
7 1. Redistributions of source code must retain the above copyright notice, this
8 list of conditions and the following disclaimer.
9 
10 2. Redistributions in binary form must reproduce the above copyright notice,
11 this list of conditions and the following disclaimer in the documentation and/or
12 other materials provided with the distribution.
13 
14 3. Neither the name of the copyright holder nor the names of its contributors
15 may be used to endorse or promote products derived from this software without
16 specific prior written permission.
17 
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22 ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 /*
31 
32 This module implements a regular expressions engine based on Thompson's
33 algorithm as described by Russ Cox in http://swtch.com/~rsc/regexp/regexp2.html.
34 
35 What the article names a "thread" has been named a "fiber" in this code, in
36 order to avoid confusion with operating system threads.
37 
38 */
39 
40 #include <assert.h>
41 #include <string.h>
42 #include <yara/compiler.h>
43 #include <yara/error.h>
44 #include <yara/globals.h>
45 #include <yara/hex_lexer.h>
46 #include <yara/limits.h>
47 #include <yara/mem.h>
48 #include <yara/re.h>
49 #include <yara/re_lexer.h>
50 #include <yara/threading.h>
51 #include <yara/utils.h>
52 
53 #define EMIT_BACKWARDS               0x01
54 #define EMIT_DONT_SET_FORWARDS_CODE  0x02
55 #define EMIT_DONT_SET_BACKWARDS_CODE 0x04
56 
57 #ifndef INT16_MAX
58 #define INT16_MAX (32767)
59 #endif
60 
61 typedef uint8_t RE_SPLIT_ID_TYPE;
62 
63 typedef struct _RE_REPEAT_ARGS
64 {
65   uint16_t min;
66   uint16_t max;
67   int32_t offset;
68 
69 } RE_REPEAT_ARGS;
70 
71 typedef struct _RE_REPEAT_ANY_ARGS
72 {
73   uint16_t min;
74   uint16_t max;
75 
76 } RE_REPEAT_ANY_ARGS;
77 
78 typedef struct _RE_EMIT_CONTEXT
79 {
80   YR_ARENA* arena;
81   RE_SPLIT_ID_TYPE next_split_id;
82 
83 } RE_EMIT_CONTEXT;
84 
85 #define CHAR_IN_CLASS(cls, chr) ((cls)[(chr) / 8] & 1 << ((chr) % 8))
86 
_yr_re_is_char_in_class(RE_CLASS * re_class,uint8_t chr,int case_insensitive)87 static bool _yr_re_is_char_in_class(
88     RE_CLASS* re_class,
89     uint8_t chr,
90     int case_insensitive)
91 {
92   int result = CHAR_IN_CLASS(re_class->bitmap, chr);
93 
94   if (case_insensitive)
95     result |= CHAR_IN_CLASS(re_class->bitmap, yr_altercase[chr]);
96 
97   if (re_class->negated)
98     result = !result;
99 
100   return result;
101 }
102 
_yr_re_is_word_char(const uint8_t * input,uint8_t character_size)103 static bool _yr_re_is_word_char(const uint8_t* input, uint8_t character_size)
104 {
105   int result = ((isalnum(*input) || (*input) == '_'));
106 
107   if (character_size == 2)
108     result = result && (*(input + 1) == 0);
109 
110   return result;
111 }
112 
yr_re_node_create(int type)113 RE_NODE* yr_re_node_create(int type)
114 {
115   RE_NODE* result = (RE_NODE*) yr_malloc(sizeof(RE_NODE));
116 
117   if (result != NULL)
118   {
119     result->type = type;
120     result->children_head = NULL;
121     result->children_tail = NULL;
122     result->prev_sibling = NULL;
123     result->next_sibling = NULL;
124     result->greedy = true;
125     result->forward_code_ref = YR_ARENA_NULL_REF;
126     result->backward_code_ref = YR_ARENA_NULL_REF;
127   }
128 
129   return result;
130 }
131 
yr_re_node_destroy(RE_NODE * node)132 void yr_re_node_destroy(RE_NODE* node)
133 {
134   RE_NODE* child = node->children_head;
135   RE_NODE* next_child;
136 
137   while (child != NULL)
138   {
139     next_child = child->next_sibling;
140     yr_re_node_destroy(child);
141     child = next_child;
142   }
143 
144   if (node->type == RE_NODE_CLASS)
145     yr_free(node->re_class);
146 
147   yr_free(node);
148 }
149 
150 ////////////////////////////////////////////////////////////////////////////////
151 // Appends a node to the end of the children list.
152 //
yr_re_node_append_child(RE_NODE * node,RE_NODE * child)153 void yr_re_node_append_child(RE_NODE* node, RE_NODE* child)
154 {
155   if (node->children_head == NULL)
156     node->children_head = child;
157 
158   if (node->children_tail != NULL)
159     node->children_tail->next_sibling = child;
160 
161   child->prev_sibling = node->children_tail;
162   node->children_tail = child;
163 }
164 
165 ////////////////////////////////////////////////////////////////////////////////
166 // Appends a node to the beginning of the children list.
167 //
yr_re_node_prepend_child(RE_NODE * node,RE_NODE * child)168 void yr_re_node_prepend_child(RE_NODE* node, RE_NODE* child)
169 {
170   child->next_sibling = node->children_head;
171 
172   if (node->children_head != NULL)
173     node->children_head->prev_sibling = child;
174 
175   node->children_head = child;
176 
177   if (node->children_tail == NULL)
178     node->children_tail = child;
179 }
180 
yr_re_ast_create(RE_AST ** re_ast)181 int yr_re_ast_create(RE_AST** re_ast)
182 {
183   *re_ast = (RE_AST*) yr_malloc(sizeof(RE_AST));
184 
185   if (*re_ast == NULL)
186     return ERROR_INSUFFICIENT_MEMORY;
187 
188   (*re_ast)->flags = 0;
189   (*re_ast)->root_node = NULL;
190 
191   return ERROR_SUCCESS;
192 }
193 
yr_re_ast_destroy(RE_AST * re_ast)194 void yr_re_ast_destroy(RE_AST* re_ast)
195 {
196   if (re_ast->root_node != NULL)
197     yr_re_node_destroy(re_ast->root_node);
198 
199   yr_free(re_ast);
200 }
201 
202 ////////////////////////////////////////////////////////////////////////////////
203 // Parses a regexp but don't emit its code. A further call to
204 // yr_re_ast_emit_code is required to get the code.
205 //
yr_re_parse(const char * re_string,RE_AST ** re_ast,RE_ERROR * error)206 int yr_re_parse(const char* re_string, RE_AST** re_ast, RE_ERROR* error)
207 {
208   return yr_parse_re_string(re_string, re_ast, error);
209 }
210 
211 ////////////////////////////////////////////////////////////////////////////////
212 // Parses a hex string but don't emit its code. A further call to
213 // yr_re_ast_emit_code is required to get the code.
214 //
yr_re_parse_hex(const char * hex_string,RE_AST ** re_ast,RE_ERROR * error)215 int yr_re_parse_hex(const char* hex_string, RE_AST** re_ast, RE_ERROR* error)
216 {
217   return yr_parse_hex_string(hex_string, re_ast, error);
218 }
219 
220 ////////////////////////////////////////////////////////////////////////////////
221 // Parses the regexp and emit its code to the provided to the
222 // YR_RE_CODE_SECTION in the specified arena.
223 //
yr_re_compile(const char * re_string,int flags,YR_ARENA * arena,YR_ARENA_REF * ref,RE_ERROR * error)224 int yr_re_compile(
225     const char* re_string,
226     int flags,
227     YR_ARENA* arena,
228     YR_ARENA_REF* ref,
229     RE_ERROR* error)
230 {
231   RE_AST* re_ast;
232   RE _re;
233 
234   FAIL_ON_ERROR(yr_re_parse(re_string, &re_ast, error));
235 
236   _re.flags = flags;
237 
238   FAIL_ON_ERROR_WITH_CLEANUP(
239       yr_arena_write_data(arena, YR_RE_CODE_SECTION, &_re, sizeof(_re), ref),
240       yr_re_ast_destroy(re_ast));
241 
242   FAIL_ON_ERROR_WITH_CLEANUP(
243       yr_re_ast_emit_code(re_ast, arena, false), yr_re_ast_destroy(re_ast));
244 
245   yr_re_ast_destroy(re_ast);
246 
247   return ERROR_SUCCESS;
248 }
249 
250 ////////////////////////////////////////////////////////////////////////////////
251 // Verifies if the target string matches the pattern
252 //
253 // Args:
254 //    context: Scan context
255 //    re: A pointer to a compiled regexp
256 //    target: Target string
257 //
258 // Returns:
259 //    See return codes for yr_re_exec
260 //
yr_re_match(YR_SCAN_CONTEXT * context,RE * re,const char * target)261 int yr_re_match(YR_SCAN_CONTEXT* context, RE* re, const char* target)
262 {
263   int result;
264 
265   yr_re_exec(
266       context,
267       re->code,
268       (uint8_t*) target,
269       strlen(target),
270       0,
271       re->flags | RE_FLAGS_SCAN,
272       NULL,
273       NULL,
274       &result);
275 
276   return result;
277 }
278 
279 ////////////////////////////////////////////////////////////////////////////////
280 // Verifies if the provided regular expression is just a literal string
281 // like "abc", "12345", without any wildcard, operator, etc. In that case
282 // returns the string as a SIZED_STRING, or returns NULL if otherwise.
283 //
284 // The caller is responsible for deallocating the returned SIZED_STRING by
285 // calling yr_free.
286 //
yr_re_ast_extract_literal(RE_AST * re_ast)287 SIZED_STRING* yr_re_ast_extract_literal(RE_AST* re_ast)
288 {
289   SIZED_STRING* string;
290   RE_NODE* child;
291 
292   int length = 0;
293 
294   if (re_ast->root_node->type == RE_NODE_LITERAL)
295   {
296     length = 1;
297   }
298   else if (re_ast->root_node->type == RE_NODE_CONCAT)
299   {
300     child = re_ast->root_node->children_tail;
301 
302     while (child != NULL && child->type == RE_NODE_LITERAL)
303     {
304       length++;
305       child = child->prev_sibling;
306     }
307 
308     if (child != NULL)
309       return NULL;
310   }
311   else
312   {
313     return NULL;
314   }
315 
316   string = (SIZED_STRING*) yr_malloc(sizeof(SIZED_STRING) + length);
317 
318   if (string == NULL)
319     return NULL;
320 
321   string->length = length;
322   string->flags = 0;
323 
324   if (re_ast->root_node->type == RE_NODE_LITERAL)
325   {
326     string->c_string[0] = re_ast->root_node->value;
327   }
328   else
329   {
330     child = re_ast->root_node->children_tail;
331 
332     while (child != NULL)
333     {
334       string->c_string[--length] = child->value;
335       child = child->prev_sibling;
336     }
337   }
338 
339   string->c_string[string->length] = '\0';
340 
341   return string;
342 }
343 
_yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE * re_node)344 int _yr_re_node_has_unbounded_quantifier_for_dot(RE_NODE* re_node)
345 {
346   RE_NODE* child;
347 
348   if ((re_node->type == RE_NODE_STAR || re_node->type == RE_NODE_PLUS) &&
349       re_node->children_head->type == RE_NODE_ANY)
350     return true;
351 
352   if (re_node->type == RE_NODE_RANGE_ANY && re_node->end == RE_MAX_RANGE)
353     return true;
354 
355   if (re_node->type == RE_NODE_CONCAT)
356   {
357     child = re_node->children_tail;
358 
359     while (child != NULL)
360     {
361       if (_yr_re_node_has_unbounded_quantifier_for_dot(child))
362         return true;
363 
364       child = child->prev_sibling;
365     }
366   }
367 
368   return false;
369 }
370 
371 ////////////////////////////////////////////////////////////////////////////////
372 // Detects the use of .*, .+ or .{x,} in a regexp. The use of wildcards with
373 // quantifiers that don't have a reasonably small upper bound causes a
374 // performance penalty. This function dectects such cases in order to warn the
375 // user about this.
376 //
yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST * re_ast)377 int yr_re_ast_has_unbounded_quantifier_for_dot(RE_AST* re_ast)
378 {
379   return _yr_re_node_has_unbounded_quantifier_for_dot(re_ast->root_node);
380 }
381 
382 ////////////////////////////////////////////////////////////////////////////////
383 // In some cases splitting a regular expression (or hex string) in two parts is
384 // convenient for increasing performance. This happens when the pattern contains
385 // a large gap (a.k.a jump), for example: { 01 02 03 [0-999] 04 05 06 }
386 // In this case the string is splitted in { 01 02 03 } and { 04 05 06 } where
387 // the latter is chained to the former. This means that { 01 02 03 } and
388 // { 04 05 06 } are handled as individual strings, and when both of them are
389 // found, YARA verifies if the distance between the matches complies with the
390 // [0-999] restriction.
391 //
392 // This function traverses a regexp's AST looking for nodes where it should be
393 // splitted. It must be noticed that this only applies to two-level ASTs (i.e.
394 // an AST consisting in a RE_NODE_CONCAT at the root where all the children are
395 // leaves).
396 //
397 // For example, { 01 02 03 [0-1000] 04 05 06 [500-2000] 07 08 09 } has the
398 // following AST:
399 //
400 // RE_NODE_CONCAT
401 // |
402 // |- RE_NODE_LITERAL (01)
403 // |- RE_NODE_LITERAL (02)
404 // |- RE_NODE_LITERAL (03)
405 // |- RE_NODE_RANGE_ANY (start=0, end=1000)
406 // |- RE_NODE_LITERAL (04)
407 // |- RE_NODE_LITERAL (05)
408 // |- RE_NODE_LITERAL (06)
409 // |- RE_NODE_RANGE_ANY (start=500, end=2000)
410 // |- RE_NODE_LITERAL (07)
411 // |- RE_NODE_LITERAL (08)
412 // |- RE_NODE_LITERAL (09)
413 //
414 // If the AST above is passed in the re_ast argument, it will be trimmed to:
415 //
416 // RE_NODE_CONCAT
417 // |
418 // |- RE_NODE_LITERAL (01)
419 // |- RE_NODE_LITERAL (02)
420 // |- RE_NODE_LITERAL (03)
421 //
422 // While remainder_re_ast will be:
423 //
424 // RE_NODE_CONCAT
425 // |
426 // |- RE_NODE_LITERAL (04)
427 // |- RE_NODE_LITERAL (05)
428 // |- RE_NODE_LITERAL (06)
429 // |- RE_NODE_RANGE_ANY (start=500, end=2000)
430 // |- RE_NODE_LITERAL (07)
431 // |- RE_NODE_LITERAL (08)
432 // |- RE_NODE_LITERAL (09)
433 //
434 // The caller is responsible for freeing the new AST in remainder_re_ast by
435 // calling yr_re_ast_destroy.
436 //
437 // The integers pointed to by min_gap and max_gap will be filled with the
438 // minimum and maximum gap size between the sub-strings represented by the
439 // two ASTs.
440 //
yr_re_ast_split_at_chaining_point(RE_AST * re_ast,RE_AST ** remainder_re_ast,int32_t * min_gap,int32_t * max_gap)441 int yr_re_ast_split_at_chaining_point(
442     RE_AST* re_ast,
443     RE_AST** remainder_re_ast,
444     int32_t* min_gap,
445     int32_t* max_gap)
446 {
447   RE_NODE* child;
448   RE_NODE* concat;
449 
450   int result;
451 
452   *remainder_re_ast = NULL;
453   *min_gap = 0;
454   *max_gap = 0;
455 
456   if (re_ast->root_node->type != RE_NODE_CONCAT)
457     return ERROR_SUCCESS;
458 
459   child = re_ast->root_node->children_head;
460 
461   while (child != NULL)
462   {
463     if (!child->greedy && child->type == RE_NODE_RANGE_ANY &&
464         child->prev_sibling != NULL && child->next_sibling != NULL &&
465         (child->start > YR_STRING_CHAINING_THRESHOLD ||
466          child->end > YR_STRING_CHAINING_THRESHOLD))
467     {
468       result = yr_re_ast_create(remainder_re_ast);
469 
470       if (result != ERROR_SUCCESS)
471         return result;
472 
473       concat = yr_re_node_create(RE_NODE_CONCAT);
474 
475       if (concat == NULL)
476         return ERROR_INSUFFICIENT_MEMORY;
477 
478       concat->children_head = child->next_sibling;
479       concat->children_tail = re_ast->root_node->children_tail;
480 
481       re_ast->root_node->children_tail = child->prev_sibling;
482 
483       child->prev_sibling->next_sibling = NULL;
484       child->next_sibling->prev_sibling = NULL;
485 
486       *min_gap = child->start;
487       *max_gap = child->end;
488 
489       (*remainder_re_ast)->root_node = concat;
490       (*remainder_re_ast)->flags = re_ast->flags;
491 
492       yr_re_node_destroy(child);
493 
494       return ERROR_SUCCESS;
495     }
496 
497     child = child->next_sibling;
498   }
499 
500   return ERROR_SUCCESS;
501 }
502 
_yr_emit_inst(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,YR_ARENA_REF * instruction_ref)503 int _yr_emit_inst(
504     RE_EMIT_CONTEXT* emit_context,
505     uint8_t opcode,
506     YR_ARENA_REF* instruction_ref)
507 {
508   FAIL_ON_ERROR(yr_arena_write_data(
509       emit_context->arena,
510       YR_RE_CODE_SECTION,
511       &opcode,
512       sizeof(uint8_t),
513       instruction_ref));
514 
515   return ERROR_SUCCESS;
516 }
517 
_yr_emit_inst_arg_uint8(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,uint8_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)518 int _yr_emit_inst_arg_uint8(
519     RE_EMIT_CONTEXT* emit_context,
520     uint8_t opcode,
521     uint8_t argument,
522     YR_ARENA_REF* instruction_ref,
523     YR_ARENA_REF* argument_ref)
524 {
525   FAIL_ON_ERROR(yr_arena_write_data(
526       emit_context->arena,
527       YR_RE_CODE_SECTION,
528       &opcode,
529       sizeof(uint8_t),
530       instruction_ref));
531 
532   FAIL_ON_ERROR(yr_arena_write_data(
533       emit_context->arena,
534       YR_RE_CODE_SECTION,
535       &argument,
536       sizeof(uint8_t),
537       argument_ref));
538 
539   return ERROR_SUCCESS;
540 }
541 
_yr_emit_inst_arg_uint16(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,uint16_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)542 int _yr_emit_inst_arg_uint16(
543     RE_EMIT_CONTEXT* emit_context,
544     uint8_t opcode,
545     uint16_t argument,
546     YR_ARENA_REF* instruction_ref,
547     YR_ARENA_REF* argument_ref)
548 {
549   FAIL_ON_ERROR(yr_arena_write_data(
550       emit_context->arena,
551       YR_RE_CODE_SECTION,
552       &opcode,
553       sizeof(uint8_t),
554       instruction_ref));
555 
556   FAIL_ON_ERROR(yr_arena_write_data(
557       emit_context->arena,
558       YR_RE_CODE_SECTION,
559       &argument,
560       sizeof(uint16_t),
561       argument_ref));
562 
563   return ERROR_SUCCESS;
564 }
565 
_yr_emit_inst_arg_uint32(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,uint32_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)566 int _yr_emit_inst_arg_uint32(
567     RE_EMIT_CONTEXT* emit_context,
568     uint8_t opcode,
569     uint32_t argument,
570     YR_ARENA_REF* instruction_ref,
571     YR_ARENA_REF* argument_ref)
572 {
573   FAIL_ON_ERROR(yr_arena_write_data(
574       emit_context->arena,
575       YR_RE_CODE_SECTION,
576       &opcode,
577       sizeof(uint8_t),
578       instruction_ref));
579 
580   FAIL_ON_ERROR(yr_arena_write_data(
581       emit_context->arena,
582       YR_RE_CODE_SECTION,
583       &argument,
584       sizeof(uint32_t),
585       argument_ref));
586 
587   return ERROR_SUCCESS;
588 }
589 
_yr_emit_inst_arg_int16(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,int16_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)590 int _yr_emit_inst_arg_int16(
591     RE_EMIT_CONTEXT* emit_context,
592     uint8_t opcode,
593     int16_t argument,
594     YR_ARENA_REF* instruction_ref,
595     YR_ARENA_REF* argument_ref)
596 {
597   FAIL_ON_ERROR(yr_arena_write_data(
598       emit_context->arena,
599       YR_RE_CODE_SECTION,
600       &opcode,
601       sizeof(uint8_t),
602       instruction_ref));
603 
604   FAIL_ON_ERROR(yr_arena_write_data(
605       emit_context->arena,
606       YR_RE_CODE_SECTION,
607       &argument,
608       sizeof(int16_t),
609       argument_ref));
610 
611   return ERROR_SUCCESS;
612 }
613 
_yr_emit_inst_arg_struct(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,void * structure,size_t structure_size,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)614 int _yr_emit_inst_arg_struct(
615     RE_EMIT_CONTEXT* emit_context,
616     uint8_t opcode,
617     void* structure,
618     size_t structure_size,
619     YR_ARENA_REF* instruction_ref,
620     YR_ARENA_REF* argument_ref)
621 {
622   FAIL_ON_ERROR(yr_arena_write_data(
623       emit_context->arena,
624       YR_RE_CODE_SECTION,
625       &opcode,
626       sizeof(uint8_t),
627       instruction_ref));
628 
629   FAIL_ON_ERROR(yr_arena_write_data(
630       emit_context->arena,
631       YR_RE_CODE_SECTION,
632       structure,
633       structure_size,
634       argument_ref));
635 
636   return ERROR_SUCCESS;
637 }
638 
_yr_emit_split(RE_EMIT_CONTEXT * emit_context,uint8_t opcode,int16_t argument,YR_ARENA_REF * instruction_ref,YR_ARENA_REF * argument_ref)639 int _yr_emit_split(
640     RE_EMIT_CONTEXT* emit_context,
641     uint8_t opcode,
642     int16_t argument,
643     YR_ARENA_REF* instruction_ref,
644     YR_ARENA_REF* argument_ref)
645 {
646   assert(opcode == RE_OPCODE_SPLIT_A || opcode == RE_OPCODE_SPLIT_B);
647 
648   if (emit_context->next_split_id == RE_MAX_SPLIT_ID)
649     return ERROR_REGULAR_EXPRESSION_TOO_COMPLEX;
650 
651   FAIL_ON_ERROR(yr_arena_write_data(
652       emit_context->arena,
653       YR_RE_CODE_SECTION,
654       &opcode,
655       sizeof(uint8_t),
656       instruction_ref));
657 
658   FAIL_ON_ERROR(yr_arena_write_data(
659       emit_context->arena,
660       YR_RE_CODE_SECTION,
661       &emit_context->next_split_id,
662       sizeof(RE_SPLIT_ID_TYPE),
663       NULL));
664 
665   emit_context->next_split_id++;
666 
667   FAIL_ON_ERROR(yr_arena_write_data(
668       emit_context->arena,
669       YR_RE_CODE_SECTION,
670       &argument,
671       sizeof(int16_t),
672       argument_ref));
673 
674   return ERROR_SUCCESS;
675 }
676 
677 #define current_re_code_offset() \
678   yr_arena_get_current_offset(emit_context->arena, YR_RE_CODE_SECTION)
679 
_yr_re_emit(RE_EMIT_CONTEXT * emit_context,RE_NODE * re_node,int flags,YR_ARENA_REF * code_ref)680 static int _yr_re_emit(
681     RE_EMIT_CONTEXT* emit_context,
682     RE_NODE* re_node,
683     int flags,
684     YR_ARENA_REF* code_ref)
685 {
686   yr_arena_off_t jmp_offset;
687 
688   yr_arena_off_t bookmark_1 = 0;
689   yr_arena_off_t bookmark_2 = 0;
690   yr_arena_off_t bookmark_3 = 0;
691   yr_arena_off_t bookmark_4 = 0;
692 
693   bool emit_split;
694   bool emit_repeat;
695   bool emit_prolog;
696   bool emit_epilog;
697 
698   RE_REPEAT_ARGS repeat_args;
699   RE_REPEAT_ARGS* repeat_start_args_addr;
700   RE_REPEAT_ANY_ARGS repeat_any_args;
701 
702   RE_NODE* child;
703 
704   int16_t* split_offset_addr = NULL;
705   int16_t* jmp_offset_addr = NULL;
706 
707   YR_ARENA_REF instruction_ref = YR_ARENA_NULL_REF;
708   YR_ARENA_REF split_offset_ref;
709   YR_ARENA_REF jmp_instruction_ref;
710   YR_ARENA_REF jmp_offset_ref;
711   YR_ARENA_REF repeat_start_args_ref;
712 
713   switch (re_node->type)
714   {
715   case RE_NODE_LITERAL:
716     FAIL_ON_ERROR(_yr_emit_inst_arg_uint8(
717         emit_context,
718         RE_OPCODE_LITERAL,
719         re_node->value,
720         &instruction_ref,
721         NULL));
722     break;
723 
724   case RE_NODE_MASKED_LITERAL:
725     FAIL_ON_ERROR(_yr_emit_inst_arg_uint16(
726         emit_context,
727         RE_OPCODE_MASKED_LITERAL,
728         re_node->mask << 8 | re_node->value,
729         &instruction_ref,
730         NULL));
731     break;
732 
733   case RE_NODE_WORD_CHAR:
734     FAIL_ON_ERROR(
735         _yr_emit_inst(emit_context, RE_OPCODE_WORD_CHAR, &instruction_ref));
736     break;
737 
738   case RE_NODE_NON_WORD_CHAR:
739     FAIL_ON_ERROR(
740         _yr_emit_inst(emit_context, RE_OPCODE_NON_WORD_CHAR, &instruction_ref));
741     break;
742 
743   case RE_NODE_WORD_BOUNDARY:
744     FAIL_ON_ERROR(
745         _yr_emit_inst(emit_context, RE_OPCODE_WORD_BOUNDARY, &instruction_ref));
746     break;
747 
748   case RE_NODE_NON_WORD_BOUNDARY:
749     FAIL_ON_ERROR(_yr_emit_inst(
750         emit_context, RE_OPCODE_NON_WORD_BOUNDARY, &instruction_ref));
751     break;
752 
753   case RE_NODE_SPACE:
754     FAIL_ON_ERROR(
755         _yr_emit_inst(emit_context, RE_OPCODE_SPACE, &instruction_ref));
756     break;
757 
758   case RE_NODE_NON_SPACE:
759     FAIL_ON_ERROR(
760         _yr_emit_inst(emit_context, RE_OPCODE_NON_SPACE, &instruction_ref));
761     break;
762 
763   case RE_NODE_DIGIT:
764     FAIL_ON_ERROR(
765         _yr_emit_inst(emit_context, RE_OPCODE_DIGIT, &instruction_ref));
766     break;
767 
768   case RE_NODE_NON_DIGIT:
769     FAIL_ON_ERROR(
770         _yr_emit_inst(emit_context, RE_OPCODE_NON_DIGIT, &instruction_ref));
771     break;
772 
773   case RE_NODE_ANY:
774     FAIL_ON_ERROR(_yr_emit_inst(emit_context, RE_OPCODE_ANY, &instruction_ref));
775     break;
776 
777   case RE_NODE_CLASS:
778     FAIL_ON_ERROR(
779         _yr_emit_inst(emit_context, RE_OPCODE_CLASS, &instruction_ref));
780 
781     FAIL_ON_ERROR(yr_arena_write_data(
782         emit_context->arena,
783         YR_RE_CODE_SECTION,
784         re_node->re_class,
785         sizeof(*re_node->re_class),
786         NULL));
787     break;
788 
789   case RE_NODE_ANCHOR_START:
790     FAIL_ON_ERROR(_yr_emit_inst(
791         emit_context, RE_OPCODE_MATCH_AT_START, &instruction_ref));
792     break;
793 
794   case RE_NODE_ANCHOR_END:
795     FAIL_ON_ERROR(
796         _yr_emit_inst(emit_context, RE_OPCODE_MATCH_AT_END, &instruction_ref));
797     break;
798 
799   case RE_NODE_CONCAT:
800     FAIL_ON_ERROR(_yr_re_emit(
801         emit_context,
802         (flags & EMIT_BACKWARDS) ? re_node->children_tail
803                                  : re_node->children_head,
804         flags,
805         &instruction_ref));
806 
807     if (flags & EMIT_BACKWARDS)
808       child = re_node->children_tail->prev_sibling;
809     else
810       child = re_node->children_head->next_sibling;
811 
812     while (child != NULL)
813     {
814       FAIL_ON_ERROR(_yr_re_emit(emit_context, child, flags, NULL));
815 
816       child = (flags & EMIT_BACKWARDS) ? child->prev_sibling
817                                        : child->next_sibling;
818     }
819     break;
820 
821   case RE_NODE_PLUS:
822     // Code for e+ looks like:
823     //
824     //          L1: code for e
825     //              split L1, L2
826     //          L2:
827     //
828     FAIL_ON_ERROR(_yr_re_emit(
829         emit_context, re_node->children_head, flags, &instruction_ref));
830 
831     jmp_offset = instruction_ref.offset - current_re_code_offset();
832 
833     if (jmp_offset < INT16_MIN)
834       return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
835 
836     FAIL_ON_ERROR(_yr_emit_split(
837         emit_context,
838         re_node->greedy ? RE_OPCODE_SPLIT_B : RE_OPCODE_SPLIT_A,
839         (int16_t) jmp_offset,
840         NULL,
841         NULL));
842 
843     break;
844 
845   case RE_NODE_STAR:
846     // Code for e* looks like:
847     //
848     //          L1: split L1, L2
849     //              code for e
850     //              jmp L1
851     //          L2:
852     FAIL_ON_ERROR(_yr_emit_split(
853         emit_context,
854         re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
855         0,
856         &instruction_ref,
857         &split_offset_ref));
858 
859     FAIL_ON_ERROR(
860         _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
861 
862     jmp_offset = instruction_ref.offset - current_re_code_offset();
863 
864     if (jmp_offset < INT16_MIN)
865       return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
866 
867     // Emit jump with offset set to 0.
868 
869     FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
870         emit_context, RE_OPCODE_JUMP, (int16_t) jmp_offset, NULL, NULL));
871 
872     jmp_offset = current_re_code_offset() - instruction_ref.offset;
873 
874     if (jmp_offset > INT16_MAX)
875       return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
876 
877     // Update split offset.
878     split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
879         emit_context->arena, &split_offset_ref);
880 
881     *split_offset_addr = (int16_t) jmp_offset;
882     break;
883 
884   case RE_NODE_ALT:
885     // Code for e1|e2 looks like:
886     //
887     //              split L1, L2
888     //          L1: code for e1
889     //              jmp L3
890     //          L2: code for e2
891     //          L3:
892 
893     // Emit a split instruction with offset set to 0 temporarily. Offset
894     // will be updated after we know the size of the code generated for
895     // the left node (e1).
896 
897     FAIL_ON_ERROR(_yr_emit_split(
898         emit_context,
899         RE_OPCODE_SPLIT_A,
900         0,
901         &instruction_ref,
902         &split_offset_ref));
903 
904     FAIL_ON_ERROR(
905         _yr_re_emit(emit_context, re_node->children_head, flags, NULL));
906 
907     // Emit jump with offset set to 0.
908 
909     FAIL_ON_ERROR(_yr_emit_inst_arg_int16(
910         emit_context,
911         RE_OPCODE_JUMP,
912         0,
913         &jmp_instruction_ref,
914         &jmp_offset_ref));
915 
916     jmp_offset = current_re_code_offset() - instruction_ref.offset;
917 
918     if (jmp_offset > INT16_MAX)
919       return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
920 
921     // Update split offset.
922     split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
923         emit_context->arena, &split_offset_ref);
924 
925     *split_offset_addr = (int16_t) jmp_offset;
926 
927     FAIL_ON_ERROR(
928         _yr_re_emit(emit_context, re_node->children_tail, flags, NULL));
929 
930     jmp_offset = current_re_code_offset() - jmp_instruction_ref.offset;
931 
932     if (jmp_offset > INT16_MAX)
933       return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
934 
935     // Update offset for jmp instruction.
936     jmp_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
937         emit_context->arena, &jmp_offset_ref);
938 
939     *jmp_offset_addr = (int16_t) jmp_offset;
940     break;
941 
942   case RE_NODE_RANGE_ANY:
943     repeat_any_args.min = re_node->start;
944     repeat_any_args.max = re_node->end;
945 
946     FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
947         emit_context,
948         re_node->greedy ? RE_OPCODE_REPEAT_ANY_GREEDY
949                         : RE_OPCODE_REPEAT_ANY_UNGREEDY,
950         &repeat_any_args,
951         sizeof(repeat_any_args),
952         &instruction_ref,
953         NULL));
954 
955     break;
956 
957   case RE_NODE_RANGE:
958     // Code for e{n,m} looks like:
959     //
960     //            code for e              ---   prolog
961     //            repeat_start n, m, L1   --+
962     //        L0: code for e                |   repeat
963     //            repeat_end n, m, L0     --+
964     //        L1: split L2, L3            ---   split
965     //        L2: code for e              ---   epilog
966     //        L3:
967     //
968     // Not all sections (prolog, repeat, split and epilog) are generated in all
969     // cases, it depends on the values of n and m. The following table shows
970     // which sections are generated for the first few values of n and m.
971     //
972     //        n,m   prolog  repeat      split  epilog
973     //                      (min,max)
974     //        ---------------------------------------
975     //        0,0     -       -           -      -
976     //        0,1     -       -           X      X
977     //        0,2     -       0,1         X      X
978     //        0,3     -       0,2         X      X
979     //        0,M     -       0,M-1       X      X
980     //
981     //        1,1     X       -           -      -
982     //        1,2     X       -           X      X
983     //        1,3     X       0,1         X      X
984     //        1,4     X       1,2         X      X
985     //        1,M     X       1,M-2       X      X
986     //
987     //        2,2     X       -           -      X
988     //        2,3     X       1,1         X      X
989     //        2,4     X       1,2         X      X
990     //        2,M     X       1,M-2       X      X
991     //
992     //        3,3     X       1,1         -      X
993     //        3,4     X       2,2         X      X
994     //        3,M     X       2,M-2       X      X
995     //
996     //        4,4     X       2,2         -      X
997     //        4,5     X       3,3         X      X
998     //        4,M     X       3,M-2       X      X
999     //
1000     // The code can't consists simply in the repeat section, the prolog and
1001     // epilog are required because we can't have atoms pointing to code inside
1002     // the repeat loop. Atoms' forwards_code will point to code in the prolog
1003     // and backwards_code will point to code in the epilog (or in prolog if
1004     // epilog wasn't generated, like in n=1,m=1)
1005 
1006     emit_prolog = re_node->start > 0;
1007     emit_repeat = re_node->end > re_node->start + 1 || re_node->end > 2;
1008     emit_split = re_node->end > re_node->start;
1009     emit_epilog = re_node->end > re_node->start || re_node->end > 1;
1010 
1011     if (emit_prolog)
1012     {
1013       FAIL_ON_ERROR(_yr_re_emit(
1014           emit_context, re_node->children_head, flags, &instruction_ref));
1015     }
1016 
1017     if (emit_repeat)
1018     {
1019       repeat_args.min = re_node->start;
1020       repeat_args.max = re_node->end;
1021 
1022       if (emit_prolog)
1023       {
1024         repeat_args.max--;
1025         repeat_args.min--;
1026       }
1027 
1028       if (emit_split)
1029       {
1030         repeat_args.max--;
1031       }
1032       else
1033       {
1034         repeat_args.min--;
1035         repeat_args.max--;
1036       }
1037 
1038       repeat_args.offset = 0;
1039 
1040       bookmark_1 = current_re_code_offset();
1041 
1042       FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1043           emit_context,
1044           re_node->greedy ? RE_OPCODE_REPEAT_START_GREEDY
1045                           : RE_OPCODE_REPEAT_START_UNGREEDY,
1046           &repeat_args,
1047           sizeof(repeat_args),
1048           emit_prolog ? NULL : &instruction_ref,
1049           &repeat_start_args_ref));
1050 
1051       bookmark_2 = current_re_code_offset();
1052 
1053       FAIL_ON_ERROR(_yr_re_emit(
1054           emit_context,
1055           re_node->children_head,
1056           flags | EMIT_DONT_SET_FORWARDS_CODE | EMIT_DONT_SET_BACKWARDS_CODE,
1057           NULL));
1058 
1059       bookmark_3 = current_re_code_offset();
1060 
1061       if (bookmark_2 - bookmark_3 < INT32_MIN)
1062         return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1063 
1064       repeat_args.offset = (int32_t)(bookmark_2 - bookmark_3);
1065 
1066       FAIL_ON_ERROR(_yr_emit_inst_arg_struct(
1067           emit_context,
1068           re_node->greedy ? RE_OPCODE_REPEAT_END_GREEDY
1069                           : RE_OPCODE_REPEAT_END_UNGREEDY,
1070           &repeat_args,
1071           sizeof(repeat_args),
1072           NULL,
1073           NULL));
1074 
1075       bookmark_4 = current_re_code_offset();
1076 
1077       repeat_start_args_addr = (RE_REPEAT_ARGS*) yr_arena_ref_to_ptr(
1078           emit_context->arena, &repeat_start_args_ref);
1079 
1080       if (bookmark_4 - bookmark_1 > INT32_MAX)
1081         return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1082 
1083       repeat_start_args_addr->offset = (int32_t)(bookmark_4 - bookmark_1);
1084     }
1085 
1086     if (emit_split)
1087     {
1088       bookmark_1 = current_re_code_offset();
1089 
1090       FAIL_ON_ERROR(_yr_emit_split(
1091           emit_context,
1092           re_node->greedy ? RE_OPCODE_SPLIT_A : RE_OPCODE_SPLIT_B,
1093           0,
1094           NULL,
1095           &split_offset_ref));
1096     }
1097 
1098     if (emit_epilog)
1099     {
1100       FAIL_ON_ERROR(_yr_re_emit(
1101           emit_context,
1102           re_node->children_head,
1103           emit_prolog ? flags | EMIT_DONT_SET_FORWARDS_CODE : flags,
1104           emit_prolog || emit_repeat ? NULL : &instruction_ref));
1105     }
1106 
1107     if (emit_split)
1108     {
1109       bookmark_2 = current_re_code_offset();
1110 
1111       if (bookmark_2 - bookmark_1 > INT16_MAX)
1112         return ERROR_REGULAR_EXPRESSION_TOO_LARGE;
1113 
1114       split_offset_addr = (int16_t*) yr_arena_ref_to_ptr(
1115           emit_context->arena, &split_offset_ref);
1116 
1117       *split_offset_addr = (int16_t)(bookmark_2 - bookmark_1);
1118     }
1119 
1120     break;
1121   }
1122 
1123   if (flags & EMIT_BACKWARDS)
1124   {
1125     if (!(flags & EMIT_DONT_SET_BACKWARDS_CODE))
1126     {
1127       re_node->backward_code_ref.buffer_id = YR_RE_CODE_SECTION;
1128       re_node->backward_code_ref.offset = yr_arena_get_current_offset(
1129           emit_context->arena, YR_RE_CODE_SECTION);
1130     }
1131   }
1132   else
1133   {
1134     if (!(flags & EMIT_DONT_SET_FORWARDS_CODE))
1135     {
1136       re_node->forward_code_ref = instruction_ref;
1137     }
1138   }
1139 
1140   if (code_ref != NULL)
1141     *code_ref = instruction_ref;
1142 
1143   return ERROR_SUCCESS;
1144 }
1145 
yr_re_ast_emit_code(RE_AST * re_ast,YR_ARENA * arena,int backwards_code)1146 int yr_re_ast_emit_code(RE_AST* re_ast, YR_ARENA* arena, int backwards_code)
1147 {
1148   RE_EMIT_CONTEXT emit_context;
1149 
1150   // Emit code for matching the regular expressions forwards.
1151   emit_context.arena = arena;
1152   emit_context.next_split_id = 0;
1153 
1154   FAIL_ON_ERROR(_yr_re_emit(
1155       &emit_context,
1156       re_ast->root_node,
1157       backwards_code ? EMIT_BACKWARDS : 0,
1158       NULL));
1159 
1160   FAIL_ON_ERROR(_yr_emit_inst(&emit_context, RE_OPCODE_MATCH, NULL));
1161 
1162   return ERROR_SUCCESS;
1163 }
1164 
_yr_re_fiber_create(RE_FIBER_POOL * fiber_pool,RE_FIBER ** new_fiber)1165 static int _yr_re_fiber_create(RE_FIBER_POOL* fiber_pool, RE_FIBER** new_fiber)
1166 {
1167   RE_FIBER* fiber;
1168 
1169   if (fiber_pool->fibers.head != NULL)
1170   {
1171     fiber = fiber_pool->fibers.head;
1172     fiber_pool->fibers.head = fiber->next;
1173 
1174     if (fiber_pool->fibers.tail == fiber)
1175       fiber_pool->fibers.tail = NULL;
1176   }
1177   else
1178   {
1179     if (fiber_pool->fiber_count == RE_MAX_FIBERS)
1180       return ERROR_TOO_MANY_RE_FIBERS;
1181 
1182     fiber = (RE_FIBER*) yr_malloc(sizeof(RE_FIBER));
1183 
1184     if (fiber == NULL)
1185       return ERROR_INSUFFICIENT_MEMORY;
1186 
1187     fiber_pool->fiber_count++;
1188   }
1189 
1190   fiber->ip = NULL;
1191   fiber->sp = -1;
1192   fiber->rc = -1;
1193   fiber->next = NULL;
1194   fiber->prev = NULL;
1195 
1196   *new_fiber = fiber;
1197 
1198   return ERROR_SUCCESS;
1199 }
1200 
1201 ////////////////////////////////////////////////////////////////////////////////
1202 // Appends 'fiber' to 'fiber_list'
1203 //
_yr_re_fiber_append(RE_FIBER_LIST * fiber_list,RE_FIBER * fiber)1204 static void _yr_re_fiber_append(RE_FIBER_LIST* fiber_list, RE_FIBER* fiber)
1205 {
1206   assert(fiber->prev == NULL);
1207   assert(fiber->next == NULL);
1208 
1209   fiber->prev = fiber_list->tail;
1210 
1211   if (fiber_list->tail != NULL)
1212     fiber_list->tail->next = fiber;
1213 
1214   fiber_list->tail = fiber;
1215 
1216   if (fiber_list->head == NULL)
1217     fiber_list->head = fiber;
1218 
1219   assert(fiber_list->tail->next == NULL);
1220   assert(fiber_list->head->prev == NULL);
1221 }
1222 
1223 ////////////////////////////////////////////////////////////////////////////////
1224 // Verifies if a fiber with the same properties (ip, rc, sp, and stack values)
1225 // than 'target_fiber' exists in 'fiber_list'. The list is iterated from
1226 // the start until 'last_fiber' (inclusive). Fibers past 'last_fiber' are not
1227 // taken into account.
1228 //
_yr_re_fiber_exists(RE_FIBER_LIST * fiber_list,RE_FIBER * target_fiber,RE_FIBER * last_fiber)1229 static int _yr_re_fiber_exists(
1230     RE_FIBER_LIST* fiber_list,
1231     RE_FIBER* target_fiber,
1232     RE_FIBER* last_fiber)
1233 {
1234   RE_FIBER* fiber = fiber_list->head;
1235 
1236   int equal_stacks;
1237   int i;
1238 
1239   if (last_fiber == NULL)
1240     return false;
1241 
1242   while (fiber != last_fiber->next)
1243   {
1244     if (fiber->ip == target_fiber->ip && fiber->sp == target_fiber->sp &&
1245         fiber->rc == target_fiber->rc)
1246     {
1247       equal_stacks = true;
1248 
1249       for (i = 0; i <= fiber->sp; i++)
1250       {
1251         if (fiber->stack[i] != target_fiber->stack[i])
1252         {
1253           equal_stacks = false;
1254           break;
1255         }
1256       }
1257 
1258       if (equal_stacks)
1259         return true;
1260     }
1261 
1262     fiber = fiber->next;
1263   }
1264 
1265   return false;
1266 }
1267 
1268 ////////////////////////////////////////////////////////////////////////////////
1269 // Clones a fiber in fiber_list and inserts the cloned fiber just after.
1270 // the original one. If fiber_list is:
1271 //
1272 //   f1 -> f2 -> f3 -> f4
1273 //
1274 // Splitting f2 will result in:
1275 //
1276 //   f1 -> f2 -> cloned f2 -> f3 -> f4
1277 //
_yr_re_fiber_split(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber,RE_FIBER ** new_fiber)1278 static int _yr_re_fiber_split(
1279     RE_FIBER_LIST* fiber_list,
1280     RE_FIBER_POOL* fiber_pool,
1281     RE_FIBER* fiber,
1282     RE_FIBER** new_fiber)
1283 {
1284   int32_t i;
1285 
1286   FAIL_ON_ERROR(_yr_re_fiber_create(fiber_pool, new_fiber));
1287 
1288   (*new_fiber)->sp = fiber->sp;
1289   (*new_fiber)->ip = fiber->ip;
1290   (*new_fiber)->rc = fiber->rc;
1291 
1292   for (i = 0; i <= fiber->sp; i++) (*new_fiber)->stack[i] = fiber->stack[i];
1293 
1294   (*new_fiber)->next = fiber->next;
1295   (*new_fiber)->prev = fiber;
1296 
1297   if (fiber->next != NULL)
1298     fiber->next->prev = *new_fiber;
1299 
1300   fiber->next = *new_fiber;
1301 
1302   if (fiber_list->tail == fiber)
1303     fiber_list->tail = *new_fiber;
1304 
1305   assert(fiber_list->tail->next == NULL);
1306   assert(fiber_list->head->prev == NULL);
1307 
1308   return ERROR_SUCCESS;
1309 }
1310 
1311 ////////////////////////////////////////////////////////////////////////////////
1312 // Kills a given fiber by removing it from the fiber list and putting it in the
1313 // fiber pool.
1314 //
_yr_re_fiber_kill(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber)1315 static RE_FIBER* _yr_re_fiber_kill(
1316     RE_FIBER_LIST* fiber_list,
1317     RE_FIBER_POOL* fiber_pool,
1318     RE_FIBER* fiber)
1319 {
1320   RE_FIBER* next_fiber = fiber->next;
1321 
1322   if (fiber->prev != NULL)
1323     fiber->prev->next = next_fiber;
1324 
1325   if (next_fiber != NULL)
1326     next_fiber->prev = fiber->prev;
1327 
1328   if (fiber_pool->fibers.tail != NULL)
1329     fiber_pool->fibers.tail->next = fiber;
1330 
1331   if (fiber_list->tail == fiber)
1332     fiber_list->tail = fiber->prev;
1333 
1334   if (fiber_list->head == fiber)
1335     fiber_list->head = next_fiber;
1336 
1337   fiber->next = NULL;
1338   fiber->prev = fiber_pool->fibers.tail;
1339   fiber_pool->fibers.tail = fiber;
1340 
1341   if (fiber_pool->fibers.head == NULL)
1342     fiber_pool->fibers.head = fiber;
1343 
1344   return next_fiber;
1345 }
1346 
1347 ////////////////////////////////////////////////////////////////////////////////
1348 // Kills all fibers from the given one up to the end of the fiber list.
1349 //
_yr_re_fiber_kill_tail(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber)1350 static void _yr_re_fiber_kill_tail(
1351     RE_FIBER_LIST* fiber_list,
1352     RE_FIBER_POOL* fiber_pool,
1353     RE_FIBER* fiber)
1354 {
1355   RE_FIBER* prev_fiber = fiber->prev;
1356 
1357   if (prev_fiber != NULL)
1358     prev_fiber->next = NULL;
1359 
1360   fiber->prev = fiber_pool->fibers.tail;
1361 
1362   if (fiber_pool->fibers.tail != NULL)
1363     fiber_pool->fibers.tail->next = fiber;
1364 
1365   fiber_pool->fibers.tail = fiber_list->tail;
1366   fiber_list->tail = prev_fiber;
1367 
1368   if (fiber_list->head == fiber)
1369     fiber_list->head = NULL;
1370 
1371   if (fiber_pool->fibers.head == NULL)
1372     fiber_pool->fibers.head = fiber;
1373 }
1374 
1375 ////////////////////////////////////////////////////////////////////////////////
1376 // Kills all fibers in the fiber list.
1377 //
_yr_re_fiber_kill_all(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool)1378 static void _yr_re_fiber_kill_all(
1379     RE_FIBER_LIST* fiber_list,
1380     RE_FIBER_POOL* fiber_pool)
1381 {
1382   if (fiber_list->head != NULL)
1383     _yr_re_fiber_kill_tail(fiber_list, fiber_pool, fiber_list->head);
1384 }
1385 
1386 ////////////////////////////////////////////////////////////////////////////////
1387 // Executes a fiber until reaching an "matching" instruction. A "matching"
1388 // instruction is one that actually reads a byte from the input and performs
1389 // some matching. If the fiber reaches a split instruction, the new fiber is
1390 // also synced.
1391 //
_yr_re_fiber_sync(RE_FIBER_LIST * fiber_list,RE_FIBER_POOL * fiber_pool,RE_FIBER * fiber_to_sync)1392 static int _yr_re_fiber_sync(
1393     RE_FIBER_LIST* fiber_list,
1394     RE_FIBER_POOL* fiber_pool,
1395     RE_FIBER* fiber_to_sync)
1396 {
1397   // A array for keeping track of which split instructions has been already
1398   // executed. Each split instruction within a regexp has an associated ID
1399   // between 0 and RE_MAX_SPLIT_ID-1. Keeping track of executed splits is
1400   // required to avoid infinite loops in regexps like (a*)* or (a|)*
1401 
1402   RE_SPLIT_ID_TYPE splits_executed[RE_MAX_SPLIT_ID];
1403   RE_SPLIT_ID_TYPE splits_executed_count = 0;
1404   RE_SPLIT_ID_TYPE split_id, splits_executed_idx;
1405 
1406   int split_already_executed;
1407 
1408   RE_REPEAT_ARGS* repeat_args;
1409   RE_REPEAT_ANY_ARGS* repeat_any_args;
1410 
1411   RE_FIBER* fiber;
1412   RE_FIBER* last;
1413   RE_FIBER* next;
1414   RE_FIBER* branch_a;
1415   RE_FIBER* branch_b;
1416 
1417   fiber = fiber_to_sync;
1418   last = fiber_to_sync->next;
1419 
1420   while (fiber != last)
1421   {
1422     uint8_t opcode = *fiber->ip;
1423 
1424     switch (opcode)
1425     {
1426     case RE_OPCODE_SPLIT_A:
1427     case RE_OPCODE_SPLIT_B:
1428 
1429       split_id = *(RE_SPLIT_ID_TYPE*) (fiber->ip + 1);
1430       split_already_executed = false;
1431 
1432       for (splits_executed_idx = 0; splits_executed_idx < splits_executed_count;
1433            splits_executed_idx++)
1434       {
1435         if (split_id == splits_executed[splits_executed_idx])
1436         {
1437           split_already_executed = true;
1438           break;
1439         }
1440       }
1441 
1442       if (split_already_executed)
1443       {
1444         fiber = _yr_re_fiber_kill(fiber_list, fiber_pool, fiber);
1445       }
1446       else
1447       {
1448         branch_a = fiber;
1449 
1450         FAIL_ON_ERROR(
1451             _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1452 
1453         // With RE_OPCODE_SPLIT_A the current fiber continues at the next
1454         // instruction in the stream (branch A), while the newly created
1455         // fiber starts at the address indicated by the instruction (branch B)
1456         // RE_OPCODE_SPLIT_B has the opposite behavior.
1457 
1458         if (opcode == RE_OPCODE_SPLIT_B)
1459           yr_swap(branch_a, branch_b, RE_FIBER*);
1460 
1461         // Branch A continues at the next instruction
1462         branch_a->ip += (sizeof(RE_SPLIT_ID_TYPE) + 3);
1463 
1464         // Branch B adds the offset encoded in the opcode to its instruction
1465         // pointer.
1466         branch_b->ip += *(int16_t*)(
1467               branch_b->ip
1468               + 1  // opcode size
1469               + sizeof(RE_SPLIT_ID_TYPE));
1470 
1471 #ifdef YR_PARANOID_MODE
1472         // In normal conditions this should never happen. But with compiled
1473         // rules that has been hand-crafted by a malicious actor this could
1474         // happen.
1475         if (splits_executed_count >= RE_MAX_SPLIT_ID)
1476           return ERROR_INTERNAL_FATAL_ERROR;
1477 #endif
1478 
1479         splits_executed[splits_executed_count] = split_id;
1480         splits_executed_count++;
1481       }
1482 
1483       break;
1484 
1485     case RE_OPCODE_REPEAT_START_GREEDY:
1486     case RE_OPCODE_REPEAT_START_UNGREEDY:
1487 
1488       repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1489       assert(repeat_args->max > 0);
1490       branch_a = fiber;
1491 
1492       if (repeat_args->min == 0)
1493       {
1494         FAIL_ON_ERROR(
1495             _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1496 
1497         if (opcode == RE_OPCODE_REPEAT_START_UNGREEDY)
1498           yr_swap(branch_a, branch_b, RE_FIBER*);
1499 
1500         branch_b->ip += repeat_args->offset;
1501       }
1502 
1503       branch_a->stack[++branch_a->sp] = 0;
1504       branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1505       break;
1506 
1507     case RE_OPCODE_REPEAT_END_GREEDY:
1508     case RE_OPCODE_REPEAT_END_UNGREEDY:
1509 
1510       repeat_args = (RE_REPEAT_ARGS*) (fiber->ip + 1);
1511       fiber->stack[fiber->sp]++;
1512 
1513       if (fiber->stack[fiber->sp] < repeat_args->min)
1514       {
1515         fiber->ip += repeat_args->offset;
1516         break;
1517       }
1518 
1519       branch_a = fiber;
1520 
1521       if (fiber->stack[fiber->sp] < repeat_args->max)
1522       {
1523         FAIL_ON_ERROR(
1524             _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1525 
1526         if (opcode == RE_OPCODE_REPEAT_END_GREEDY)
1527           yr_swap(branch_a, branch_b, RE_FIBER*);
1528 
1529         branch_b->ip += repeat_args->offset;
1530       }
1531 
1532       branch_a->sp--;
1533       branch_a->ip += (1 + sizeof(RE_REPEAT_ARGS));
1534       break;
1535 
1536     case RE_OPCODE_REPEAT_ANY_GREEDY:
1537     case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1538 
1539       repeat_any_args = (RE_REPEAT_ANY_ARGS*) (fiber->ip + 1);
1540 
1541       // If repetition counter (rc) is -1 it means that we are reaching this
1542       // instruction from the previous one in the instructions stream. In
1543       // this case let's initialize the counter to 0 and start looping.
1544 
1545       if (fiber->rc == -1)
1546         fiber->rc = 0;
1547 
1548       if (fiber->rc < repeat_any_args->min)
1549       {
1550         // Increase repetition counter and continue with next fiber. The
1551         // instruction pointer for this fiber is not incremented yet, this
1552         // fiber spins in this same instruction until reaching the minimum
1553         // number of repetitions.
1554 
1555         fiber->rc++;
1556         fiber = fiber->next;
1557       }
1558       else if (fiber->rc < repeat_any_args->max)
1559       {
1560         // Once the minimum number of repetitions are matched one fiber
1561         // remains spinning in this instruction until reaching the maximum
1562         // number of repetitions while new fibers are created. New fibers
1563         // start executing at the next instruction.
1564 
1565         next = fiber->next;
1566         branch_a = fiber;
1567 
1568         FAIL_ON_ERROR(
1569             _yr_re_fiber_split(fiber_list, fiber_pool, branch_a, &branch_b));
1570 
1571         if (opcode == RE_OPCODE_REPEAT_ANY_UNGREEDY)
1572           yr_swap(branch_a, branch_b, RE_FIBER*);
1573 
1574         branch_a->rc++;
1575         branch_b->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1576         branch_b->rc = -1;
1577 
1578         FAIL_ON_ERROR(_yr_re_fiber_sync(fiber_list, fiber_pool, branch_b));
1579 
1580         fiber = next;
1581       }
1582       else
1583       {
1584         // When the maximum number of repetitions is reached the fiber keeps
1585         // executing at the next instruction. The repetition counter is set
1586         // to -1 indicating that we are not spinning in a repeat instruction
1587         // anymore.
1588 
1589         fiber->ip += (1 + sizeof(RE_REPEAT_ANY_ARGS));
1590         fiber->rc = -1;
1591       }
1592 
1593       break;
1594 
1595     case RE_OPCODE_JUMP:
1596       fiber->ip += *(int16_t*) (fiber->ip + 1);
1597       break;
1598 
1599     default:
1600       fiber = fiber->next;
1601     }
1602   }
1603 
1604   return ERROR_SUCCESS;
1605 }
1606 
1607 ////////////////////////////////////////////////////////////////////////////////
1608 // Executes a regular expression. The specified regular expression will try to
1609 // match the data starting at the address specified by "input". The "input"
1610 // pointer can point to any address inside a memory buffer. Arguments
1611 // "input_forwards_size" and "input_backwards_size" indicate how many bytes
1612 // can be accessible starting at "input" and going forwards and backwards
1613 // respectively.
1614 //
1615 //   <--- input_backwards_size -->|<----------- input_forwards_size -------->
1616 //  |--------  memory buffer  -----------------------------------------------|
1617 //                                ^
1618 //                              input
1619 //
1620 // Args:
1621 //   YR_SCAN_CONTEXT *context         - Scan context.
1622 //   const uint8_t* code              - Regexp code be executed
1623 //   const uint8_t* input             - Pointer to input data
1624 //   size_t input_forwards_size       - Number of accessible bytes starting at
1625 //                                      "input" and going forwards.
1626 //   size_t input_backwards_size      - Number of accessible bytes starting at
1627 //                                      "input" and going backwards
1628 //   int flags                        - Flags:
1629 //      RE_FLAGS_SCAN
1630 //      RE_FLAGS_BACKWARDS
1631 //      RE_FLAGS_EXHAUSTIVE
1632 //      RE_FLAGS_WIDE
1633 //      RE_FLAGS_NO_CASE
1634 //      RE_FLAGS_DOT_ALL
1635 //   RE_MATCH_CALLBACK_FUNC callback  - Callback function
1636 //   void* callback_args              - Callback argument
1637 //   int*  matches                    - Pointer to an integer receiving the
1638 //                                      number of matching bytes. Notice that
1639 //                                      0 means a zero-length match, while no
1640 //                                      matches is -1.
1641 // Returns:
1642 //    ERROR_SUCCESS or any other error code.
1643 
yr_re_exec(YR_SCAN_CONTEXT * context,const uint8_t * code,const uint8_t * input_data,size_t input_forwards_size,size_t input_backwards_size,int flags,RE_MATCH_CALLBACK_FUNC callback,void * callback_args,int * matches)1644 int yr_re_exec(
1645     YR_SCAN_CONTEXT* context,
1646     const uint8_t* code,
1647     const uint8_t* input_data,
1648     size_t input_forwards_size,
1649     size_t input_backwards_size,
1650     int flags,
1651     RE_MATCH_CALLBACK_FUNC callback,
1652     void* callback_args,
1653     int* matches)
1654 {
1655   const uint8_t* input;
1656   const uint8_t* ip;
1657 
1658   uint8_t mask;
1659   uint8_t value;
1660   uint8_t character_size;
1661 
1662   RE_FIBER_LIST fibers;
1663   RE_FIBER* fiber;
1664   RE_FIBER* next_fiber;
1665 
1666   int bytes_matched;
1667   int max_bytes_matched;
1668 
1669   int match;
1670   int input_incr;
1671   int kill;
1672   int action;
1673 
1674 #define ACTION_NONE      0
1675 #define ACTION_CONTINUE  1
1676 #define ACTION_KILL      2
1677 #define ACTION_KILL_TAIL 3
1678 
1679 #define prolog                                      \
1680   {                                                 \
1681     if ((bytes_matched >= max_bytes_matched) ||     \
1682         (character_size == 2 && *(input + 1) != 0)) \
1683     {                                               \
1684       action = ACTION_KILL;                         \
1685       break;                                        \
1686     }                                               \
1687   }
1688 
1689   if (matches != NULL)
1690     *matches = -1;
1691 
1692   if (flags & RE_FLAGS_WIDE)
1693     character_size = 2;
1694   else
1695     character_size = 1;
1696 
1697   input = input_data;
1698   input_incr = character_size;
1699 
1700   if (flags & RE_FLAGS_BACKWARDS)
1701   {
1702     max_bytes_matched = (int) yr_min(input_backwards_size, YR_RE_SCAN_LIMIT);
1703     input -= character_size;
1704     input_incr = -input_incr;
1705   }
1706   else
1707   {
1708     max_bytes_matched = (int) yr_min(input_forwards_size, YR_RE_SCAN_LIMIT);
1709   }
1710 
1711   // Round down max_bytes_matched to a multiple of character_size, this way if
1712   // character_size is 2 and max_bytes_matched is odd we are ignoring the
1713   // extra byte which can't match anyways.
1714 
1715   max_bytes_matched = max_bytes_matched - max_bytes_matched % character_size;
1716   bytes_matched = 0;
1717 
1718   FAIL_ON_ERROR(_yr_re_fiber_create(&context->re_fiber_pool, &fiber));
1719 
1720   fiber->ip = code;
1721   fibers.head = fiber;
1722   fibers.tail = fiber;
1723 
1724   FAIL_ON_ERROR_WITH_CLEANUP(
1725       _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1726       _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1727 
1728   while (fibers.head != NULL)
1729   {
1730     fiber = fibers.head;
1731 
1732     while (fiber != NULL)
1733     {
1734       next_fiber = fiber->next;
1735 
1736       if (_yr_re_fiber_exists(&fibers, fiber, fiber->prev))
1737         _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1738 
1739       fiber = next_fiber;
1740     }
1741 
1742     fiber = fibers.head;
1743 
1744     while (fiber != NULL)
1745     {
1746       ip = fiber->ip;
1747       action = ACTION_NONE;
1748 
1749       switch (*ip)
1750       {
1751       case RE_OPCODE_ANY:
1752         prolog;
1753         match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1754         action = match ? ACTION_NONE : ACTION_KILL;
1755         fiber->ip += 1;
1756         break;
1757 
1758       case RE_OPCODE_REPEAT_ANY_GREEDY:
1759       case RE_OPCODE_REPEAT_ANY_UNGREEDY:
1760         prolog;
1761         match = (flags & RE_FLAGS_DOT_ALL) || (*input != 0x0A);
1762         action = match ? ACTION_NONE : ACTION_KILL;
1763 
1764         // The instruction pointer is not incremented here. The current fiber
1765         // spins in this instruction until reaching the required number of
1766         // repetitions. The code controlling the number of repetitions is in
1767         // _yr_re_fiber_sync.
1768 
1769         break;
1770 
1771       case RE_OPCODE_LITERAL:
1772         prolog;
1773         if (flags & RE_FLAGS_NO_CASE)
1774           match = yr_lowercase[*input] == yr_lowercase[*(ip + 1)];
1775         else
1776           match = (*input == *(ip + 1));
1777         action = match ? ACTION_NONE : ACTION_KILL;
1778         fiber->ip += 2;
1779         break;
1780 
1781       case RE_OPCODE_MASKED_LITERAL:
1782         prolog;
1783         value = *(int16_t*) (ip + 1) & 0xFF;
1784         mask = *(int16_t*) (ip + 1) >> 8;
1785 
1786         // We don't need to take into account the case-insensitive
1787         // case because this opcode is only used with hex strings,
1788         // which can't be case-insensitive.
1789 
1790         match = ((*input & mask) == value);
1791         action = match ? ACTION_NONE : ACTION_KILL;
1792         fiber->ip += 3;
1793         break;
1794 
1795       case RE_OPCODE_CLASS:
1796         prolog;
1797         match = _yr_re_is_char_in_class(
1798             (RE_CLASS*) (ip + 1), *input, flags & RE_FLAGS_NO_CASE);
1799         action = match ? ACTION_NONE : ACTION_KILL;
1800         fiber->ip += (sizeof(RE_CLASS) + 1);
1801         break;
1802 
1803       case RE_OPCODE_WORD_CHAR:
1804         prolog;
1805         match = _yr_re_is_word_char(input, character_size);
1806         action = match ? ACTION_NONE : ACTION_KILL;
1807         fiber->ip += 1;
1808         break;
1809 
1810       case RE_OPCODE_NON_WORD_CHAR:
1811         prolog;
1812         match = !_yr_re_is_word_char(input, character_size);
1813         action = match ? ACTION_NONE : ACTION_KILL;
1814         fiber->ip += 1;
1815         break;
1816 
1817       case RE_OPCODE_SPACE:
1818       case RE_OPCODE_NON_SPACE:
1819 
1820         prolog;
1821 
1822         switch (*input)
1823         {
1824         case ' ':
1825         case '\t':
1826         case '\r':
1827         case '\n':
1828         case '\v':
1829         case '\f':
1830           match = true;
1831           break;
1832         default:
1833           match = false;
1834         }
1835 
1836         if (*ip == RE_OPCODE_NON_SPACE)
1837           match = !match;
1838 
1839         action = match ? ACTION_NONE : ACTION_KILL;
1840         fiber->ip += 1;
1841         break;
1842 
1843       case RE_OPCODE_DIGIT:
1844         prolog;
1845         match = isdigit(*input);
1846         action = match ? ACTION_NONE : ACTION_KILL;
1847         fiber->ip += 1;
1848         break;
1849 
1850       case RE_OPCODE_NON_DIGIT:
1851         prolog;
1852         match = !isdigit(*input);
1853         action = match ? ACTION_NONE : ACTION_KILL;
1854         fiber->ip += 1;
1855         break;
1856 
1857       case RE_OPCODE_WORD_BOUNDARY:
1858       case RE_OPCODE_NON_WORD_BOUNDARY:
1859 
1860         if (bytes_matched == 0 && input_backwards_size < character_size)
1861         {
1862           match = true;
1863         }
1864         else if (bytes_matched >= max_bytes_matched)
1865         {
1866           match = true;
1867         }
1868         else
1869         {
1870           assert(input < input_data + input_forwards_size);
1871           assert(input >= input_data - input_backwards_size);
1872 
1873           assert(input - input_incr < input_data + input_forwards_size);
1874           assert(input - input_incr >= input_data - input_backwards_size);
1875 
1876           match = _yr_re_is_word_char(input, character_size) !=
1877                   _yr_re_is_word_char(input - input_incr, character_size);
1878         }
1879 
1880         if (*ip == RE_OPCODE_NON_WORD_BOUNDARY)
1881           match = !match;
1882 
1883         action = match ? ACTION_CONTINUE : ACTION_KILL;
1884         fiber->ip += 1;
1885         break;
1886 
1887       case RE_OPCODE_MATCH_AT_START:
1888         if (flags & RE_FLAGS_BACKWARDS)
1889           kill = input_backwards_size > (size_t) bytes_matched;
1890         else
1891           kill = input_backwards_size > 0 || (bytes_matched != 0);
1892         action = kill ? ACTION_KILL : ACTION_CONTINUE;
1893         fiber->ip += 1;
1894         break;
1895 
1896       case RE_OPCODE_MATCH_AT_END:
1897         kill = flags & RE_FLAGS_BACKWARDS ||
1898                input_forwards_size > (size_t) bytes_matched;
1899         action = kill ? ACTION_KILL : ACTION_CONTINUE;
1900         fiber->ip += 1;
1901         break;
1902 
1903       case RE_OPCODE_MATCH:
1904 
1905         if (matches != NULL)
1906           *matches = bytes_matched;
1907 
1908         if (flags & RE_FLAGS_EXHAUSTIVE)
1909         {
1910           if (callback != NULL)
1911           {
1912             if (flags & RE_FLAGS_BACKWARDS)
1913             {
1914               FAIL_ON_ERROR_WITH_CLEANUP(
1915                   callback(
1916                       input + character_size,
1917                       bytes_matched,
1918                       flags,
1919                       callback_args),
1920                   _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1921             }
1922             else
1923             {
1924               FAIL_ON_ERROR_WITH_CLEANUP(
1925                   callback(input_data, bytes_matched, flags, callback_args),
1926                   _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1927             }
1928           }
1929 
1930           action = ACTION_KILL;
1931         }
1932         else
1933         {
1934           action = ACTION_KILL_TAIL;
1935         }
1936 
1937         break;
1938 
1939       default:
1940         assert(false);
1941       }
1942 
1943       switch (action)
1944       {
1945       case ACTION_KILL:
1946         fiber = _yr_re_fiber_kill(&fibers, &context->re_fiber_pool, fiber);
1947         break;
1948 
1949       case ACTION_KILL_TAIL:
1950         _yr_re_fiber_kill_tail(&fibers, &context->re_fiber_pool, fiber);
1951         fiber = NULL;
1952         break;
1953 
1954       case ACTION_CONTINUE:
1955         FAIL_ON_ERROR_WITH_CLEANUP(
1956             _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1957             _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1958         break;
1959 
1960       default:
1961         next_fiber = fiber->next;
1962         FAIL_ON_ERROR_WITH_CLEANUP(
1963             _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1964             _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1965         fiber = next_fiber;
1966       }
1967     }
1968 
1969     input += input_incr;
1970     bytes_matched += character_size;
1971 
1972     if (flags & RE_FLAGS_SCAN && bytes_matched < max_bytes_matched)
1973     {
1974       FAIL_ON_ERROR_WITH_CLEANUP(
1975           _yr_re_fiber_create(&context->re_fiber_pool, &fiber),
1976           _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1977 
1978       fiber->ip = code;
1979       _yr_re_fiber_append(&fibers, fiber);
1980 
1981       FAIL_ON_ERROR_WITH_CLEANUP(
1982           _yr_re_fiber_sync(&fibers, &context->re_fiber_pool, fiber),
1983           _yr_re_fiber_kill_all(&fibers, &context->re_fiber_pool));
1984     }
1985   }
1986 
1987   return ERROR_SUCCESS;
1988 }
1989 
1990 ////////////////////////////////////////////////////////////////////////////////
1991 // This function replaces yr_re_exec for regular expressions marked with flag
1992 // RE_FLAGS_FAST_REGEXP. These are regular expression whose code contain only
1993 // the following operations: RE_OPCODE_LITERAL, RE_OPCODE_MASKED_LITERAL,
1994 // RE_OPCODE_ANY, RE_OPCODE_REPEAT_ANY_UNGREEDY and RE_OPCODE_MATCH. Some
1995 // examples of regular expressions that can be executed with this function are:
1996 //
1997 //  /foobar/
1998 //  /foo.*?bar/
1999 //
yr_re_fast_exec(YR_SCAN_CONTEXT * context,const uint8_t * code,const uint8_t * input_data,size_t input_forwards_size,size_t input_backwards_size,int flags,RE_MATCH_CALLBACK_FUNC callback,void * callback_args,int * matches)2000 int yr_re_fast_exec(
2001     YR_SCAN_CONTEXT* context,
2002     const uint8_t* code,
2003     const uint8_t* input_data,
2004     size_t input_forwards_size,
2005     size_t input_backwards_size,
2006     int flags,
2007     RE_MATCH_CALLBACK_FUNC callback,
2008     void* callback_args,
2009     int* matches)
2010 {
2011   RE_REPEAT_ANY_ARGS* repeat_any_args;
2012 
2013   const uint8_t* code_stack[YR_MAX_FAST_RE_STACK];
2014   const uint8_t* input_stack[YR_MAX_FAST_RE_STACK];
2015   int matches_stack[YR_MAX_FAST_RE_STACK];
2016 
2017   const uint8_t* input = input_data;
2018   const uint8_t* next_input;
2019   const uint8_t* ip = code;
2020   const uint8_t* next_opcode;
2021 
2022   uint8_t mask;
2023   uint8_t value;
2024 
2025   int stop;
2026   int input_incr;
2027   int sp = 0;
2028 
2029   int bytes_matched;
2030   int max_bytes_matched;
2031 
2032   max_bytes_matched = flags & RE_FLAGS_BACKWARDS ? (int) input_backwards_size
2033                                                  : (int) input_forwards_size;
2034 
2035   input_incr = flags & RE_FLAGS_BACKWARDS ? -1 : 1;
2036 
2037   if (flags & RE_FLAGS_BACKWARDS)
2038     input--;
2039 
2040   code_stack[sp] = code;
2041   input_stack[sp] = input;
2042   matches_stack[sp] = 0;
2043   sp++;
2044 
2045   while (sp > 0)
2046   {
2047     sp--;
2048     ip = code_stack[sp];
2049     input = input_stack[sp];
2050     bytes_matched = matches_stack[sp];
2051     stop = false;
2052 
2053     while (!stop)
2054     {
2055       if (*ip == RE_OPCODE_MATCH)
2056       {
2057         if (flags & RE_FLAGS_EXHAUSTIVE)
2058         {
2059           FAIL_ON_ERROR(callback(
2060               // When matching forwards the matching data always starts at
2061               // input_data, when matching backwards it starts at input + 1 or
2062               // input_data - input_backwards_size if input + 1 is outside the
2063               // buffer.
2064               flags & RE_FLAGS_BACKWARDS
2065                   ? yr_max(input + 1, input_data - input_backwards_size)
2066                   : input_data,
2067               // The number of matched bytes should not be larger than
2068               // max_bytes_matched.
2069               yr_min(bytes_matched, max_bytes_matched),
2070               flags,
2071               callback_args));
2072 
2073           break;
2074         }
2075         else
2076         {
2077           if (matches != NULL)
2078             *matches = bytes_matched;
2079 
2080           return ERROR_SUCCESS;
2081         }
2082       }
2083 
2084       if (bytes_matched >= max_bytes_matched)
2085         break;
2086 
2087       switch (*ip)
2088       {
2089       case RE_OPCODE_LITERAL:
2090 
2091         if (*input == *(ip + 1))
2092         {
2093           bytes_matched++;
2094           input += input_incr;
2095           ip += 2;
2096         }
2097         else
2098         {
2099           stop = true;
2100         }
2101 
2102         break;
2103 
2104       case RE_OPCODE_MASKED_LITERAL:
2105 
2106         value = *(int16_t*) (ip + 1) & 0xFF;
2107         mask = *(int16_t*) (ip + 1) >> 8;
2108 
2109         if ((*input & mask) == value)
2110         {
2111           bytes_matched++;
2112           input += input_incr;
2113           ip += 3;
2114         }
2115         else
2116         {
2117           stop = true;
2118         }
2119 
2120         break;
2121 
2122       case RE_OPCODE_ANY:
2123 
2124         bytes_matched++;
2125         input += input_incr;
2126         ip += 1;
2127 
2128         break;
2129 
2130       case RE_OPCODE_REPEAT_ANY_UNGREEDY:
2131 
2132         repeat_any_args = (RE_REPEAT_ANY_ARGS*) (ip + 1);
2133         next_opcode = ip + 1 + sizeof(RE_REPEAT_ANY_ARGS);
2134 
2135         for (int i = repeat_any_args->min + 1; i <= repeat_any_args->max; i++)
2136         {
2137           if (bytes_matched + i >= max_bytes_matched)
2138             break;
2139 
2140           next_input = input + i * input_incr;
2141 
2142           if (*(next_opcode) != RE_OPCODE_LITERAL ||
2143               (*(next_opcode) == RE_OPCODE_LITERAL &&
2144                *(next_opcode + 1) == *next_input))
2145           {
2146             if (sp >= YR_MAX_FAST_RE_STACK)
2147               return ERROR_TOO_MANY_RE_FIBERS;
2148 
2149             code_stack[sp] = next_opcode;
2150             input_stack[sp] = next_input;
2151             matches_stack[sp] = bytes_matched + i;
2152             sp++;
2153           }
2154         }
2155 
2156         input += input_incr * repeat_any_args->min;
2157         bytes_matched += repeat_any_args->min;
2158 
2159         ip = next_opcode;
2160 
2161         break;
2162 
2163       default:
2164         assert(false);
2165       }
2166     }
2167   }
2168 
2169   if (matches != NULL)
2170     *matches = -1;
2171 
2172   return ERROR_SUCCESS;
2173 }
2174 
_yr_re_print_node(RE_NODE * re_node,uint32_t indent)2175 static void _yr_re_print_node(RE_NODE* re_node, uint32_t indent)
2176 {
2177   RE_NODE* child;
2178   int i;
2179 
2180   if (re_node == NULL)
2181     return;
2182 
2183   if (indent > 0)
2184     printf("\n%*s", indent, " ");
2185   switch (re_node->type)
2186   {
2187   case RE_NODE_ALT:
2188     printf("Alt(");
2189     _yr_re_print_node(re_node->children_head, indent + 4);
2190     printf(",");
2191     _yr_re_print_node(re_node->children_tail, indent + 4);
2192     printf("\n%*s%s", indent, " ", ")");
2193     break;
2194 
2195   case RE_NODE_CONCAT:
2196     printf("Cat(");
2197     child = re_node->children_head;
2198     while (child != NULL)
2199     {
2200       _yr_re_print_node(child, indent + 4);
2201       printf(",");
2202       child = child->next_sibling;
2203     }
2204     printf("\n%*s%s", indent, " ", ")");
2205     break;
2206 
2207   case RE_NODE_STAR:
2208     printf("Star(");
2209     _yr_re_print_node(re_node->children_head, indent + 4);
2210     printf(")");
2211     break;
2212 
2213   case RE_NODE_PLUS:
2214     printf("Plus(");
2215     _yr_re_print_node(re_node->children_head, indent + 4);
2216     printf(")");
2217     break;
2218 
2219   case RE_NODE_LITERAL:
2220     printf("Lit(%c)", re_node->value);
2221     break;
2222 
2223   case RE_NODE_MASKED_LITERAL:
2224     printf("MaskedLit(%02X,%02X)", re_node->value, re_node->mask);
2225     break;
2226 
2227   case RE_NODE_WORD_CHAR:
2228     printf("WordChar");
2229     break;
2230 
2231   case RE_NODE_NON_WORD_CHAR:
2232     printf("NonWordChar");
2233     break;
2234 
2235   case RE_NODE_SPACE:
2236     printf("Space");
2237     break;
2238 
2239   case RE_NODE_NON_SPACE:
2240     printf("NonSpace");
2241     break;
2242 
2243   case RE_NODE_DIGIT:
2244     printf("Digit");
2245     break;
2246 
2247   case RE_NODE_NON_DIGIT:
2248     printf("NonDigit");
2249     break;
2250 
2251   case RE_NODE_ANY:
2252     printf("Any");
2253     break;
2254 
2255   case RE_NODE_RANGE:
2256     printf("Range(%d-%d, ", re_node->start, re_node->end);
2257     _yr_re_print_node(re_node->children_head, indent + 4);
2258     printf("\n%*s%s", indent, " ", ")");
2259     break;
2260 
2261   case RE_NODE_CLASS:
2262     printf("Class(");
2263     for (i = 0; i < 256; i++)
2264       if (_yr_re_is_char_in_class(re_node->re_class, i, false))
2265         printf("%02X,", i);
2266     printf(")");
2267     break;
2268 
2269   default:
2270     printf("???");
2271     break;
2272   }
2273 }
2274 
yr_re_print(RE_AST * re_ast)2275 void yr_re_print(RE_AST* re_ast)
2276 {
2277   _yr_re_print_node(re_ast->root_node, 0);
2278 }
2279