1 #include "tree_sitter/api.h"
2 #include "./alloc.h"
3 #include "./array.h"
4 #include "./language.h"
5 #include "./point.h"
6 #include "./tree_cursor.h"
7 #include "./unicode.h"
8 #include <wctype.h>
9 
10 // #define DEBUG_ANALYZE_QUERY
11 // #define LOG(...) fprintf(stderr, __VA_ARGS__)
12 #define LOG(...)
13 
14 #define MAX_STEP_CAPTURE_COUNT 3
15 #define MAX_STATE_PREDECESSOR_COUNT 100
16 #define MAX_ANALYSIS_STATE_DEPTH 12
17 #define MAX_NEGATED_FIELD_COUNT 8
18 
19 /*
20  * Stream - A sequence of unicode characters derived from a UTF8 string.
21  * This struct is used in parsing queries from S-expressions.
22  */
23 typedef struct {
24   const char *input;
25   const char *start;
26   const char *end;
27   int32_t next;
28   uint8_t next_size;
29 } Stream;
30 
31 /*
32  * QueryStep - A step in the process of matching a query. Each node within
33  * a query S-expression corresponds to one of these steps. An entire pattern
34  * is represented as a sequence of these steps. The basic properties of a
35  * node are represented by these fields:
36  * - `symbol` - The grammar symbol to match. A zero value represents the
37  *    wildcard symbol, '_'.
38  * - `field` - The field name to match. A zero value means that a field name
39  *    was not specified.
40  * - `capture_ids` - An array of integers representing the names of captures
41  *    associated with this node in the pattern, terminated by a `NONE` value.
42  * - `depth` - The depth where this node occurs in the pattern. The root node
43  *    of the pattern has depth zero.
44  *
45  * For simple patterns, steps are matched in sequential order. But in order to
46  * handle alternative/repeated/optional sub-patterns, query steps are not always
47  * structured as a linear sequence; they sometimes need to split and merge. This
48  * is done using the following fields:
49  *  - `alternative_index` - The index of a different query step that serves as
50  *    an alternative to this step. A `NONE` value represents no alternative.
51  *    When a query state reaches a step with an alternative index, the state
52  *    is duplicated, with one copy remaining at the original step, and one copy
53  *    moving to the alternative step. The alternative may have its own alternative
54  *    step, so this splitting is an iterative process.
55  * - `is_dead_end` - Indication that this state cannot be passed directly, and
56  *    exists only in order to redirect to an alternative index, with no splitting.
57  * - `is_pass_through` - Indication that state has no matching logic of its own,
58  *    and exists only to split a state. One copy of the state advances immediately
59  *    to the next step, and one moves to the alternative step.
60  *
61  * Steps have some additional fields in order to handle the `.` (or "anchor") operator,
62  * which forbids additional child nodes:
63  * - `is_immediate` - Indication that the node matching this step cannot be preceded
64  *   by other sibling nodes that weren't specified in the pattern.
65  * - `is_last_child` - Indicates that the node matching this step cannot have any
66  *   subsequent named siblings.
67  */
68 typedef struct {
69   TSSymbol symbol;
70   TSSymbol supertype_symbol;
71   TSFieldId field;
72   uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
73   uint16_t depth;
74   uint16_t alternative_index;
75   uint16_t negated_field_list_id;
76   bool contains_captures: 1;
77   bool is_immediate: 1;
78   bool is_last_child: 1;
79   bool is_pass_through: 1;
80   bool is_dead_end: 1;
81   bool alternative_is_immediate: 1;
82   bool is_definite: 1;
83 } QueryStep;
84 
85 /*
86  * Slice - A slice of an external array. Within a query, capture names,
87  * literal string values, and predicate step informations are stored in three
88  * contiguous arrays. Individual captures, string values, and predicates are
89  * represented as slices of these three arrays.
90  */
91 typedef struct {
92   uint32_t offset;
93   uint32_t length;
94 } Slice;
95 
96 /*
97  * SymbolTable - a two-way mapping of strings to ids.
98  */
99 typedef struct {
100   Array(char) characters;
101   Array(Slice) slices;
102 } SymbolTable;
103 
104 /*
105  * PatternEntry - Information about the starting point for matching a particular
106  * pattern. These entries are stored in a 'pattern map' - a sorted array that
107  * makes it possible to efficiently lookup patterns based on the symbol for their
108  * first step. The entry consists of the following fields:
109  * - `pattern_index` - the index of the pattern within the query
110  * - `step_index` - the index of the pattern's first step in the shared `steps` array
111  * - `is_rooted` - whether or not the pattern has a single root node. This property
112  *   affects decisions about whether or not to start the pattern for nodes outside
113  *   of a QueryCursor's range restriction.
114  */
115 typedef struct {
116   uint16_t step_index;
117   uint16_t pattern_index;
118   bool is_rooted;
119 } PatternEntry;
120 
121 typedef struct {
122   Slice steps;
123   Slice predicate_steps;
124   uint32_t start_byte;
125 } QueryPattern;
126 
127 typedef struct {
128   uint32_t byte_offset;
129   uint16_t step_index;
130 } StepOffset;
131 
132 /*
133  * QueryState - The state of an in-progress match of a particular pattern
134  * in a query. While executing, a `TSQueryCursor` must keep track of a number
135  * of possible in-progress matches. Each of those possible matches is
136  * represented as one of these states. Fields:
137  * - `id` - A numeric id that is exposed to the public API. This allows the
138  *    caller to remove a given match, preventing any more of its captures
139  *    from being returned.
140  * - `start_depth` - The depth in the tree where the first step of the state's
141  *    pattern was matched.
142  * - `pattern_index` - The pattern that the state is matching.
143  * - `consumed_capture_count` - The number of captures from this match that
144  *    have already been returned.
145  * - `capture_list_id` - A numeric id that can be used to retrieve the state's
146  *    list of captures from the `CaptureListPool`.
147  * - `seeking_immediate_match` - A flag that indicates that the state's next
148  *    step must be matched by the very next sibling. This is used when
149  *    processing repetitions.
150  * - `has_in_progress_alternatives` - A flag that indicates that there is are
151  *    other states that have the same captures as this state, but are at
152  *    different steps in their pattern. This means that in order to obey the
153  *    'longest-match' rule, this state should not be returned as a match until
154  *    it is clear that there can be no other alternative match with more captures.
155  */
156 typedef struct {
157   uint32_t id;
158   uint32_t capture_list_id;
159   uint16_t start_depth;
160   uint16_t step_index;
161   uint16_t pattern_index;
162   uint16_t consumed_capture_count: 12;
163   bool seeking_immediate_match: 1;
164   bool has_in_progress_alternatives: 1;
165   bool dead: 1;
166   bool needs_parent: 1;
167 } QueryState;
168 
169 typedef Array(TSQueryCapture) CaptureList;
170 
171 /*
172  * CaptureListPool - A collection of *lists* of captures. Each query state needs
173  * to maintain its own list of captures. To avoid repeated allocations, this struct
174  * maintains a fixed set of capture lists, and keeps track of which ones are
175  * currently in use by a query state.
176  */
177 typedef struct {
178   Array(CaptureList) list;
179   CaptureList empty_list;
180   // The maximum number of capture lists that we are allowed to allocate. We
181   // never allow `list` to allocate more entries than this, dropping pending
182   // matches if needed to stay under the limit.
183   uint32_t max_capture_list_count;
184   // The number of capture lists allocated in `list` that are not currently in
185   // use. We reuse those existing-but-unused capture lists before trying to
186   // allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
187   // list's length to indicate that it's not in use.
188   uint32_t free_capture_list_count;
189 } CaptureListPool;
190 
191 /*
192  * AnalysisState - The state needed for walking the parse table when analyzing
193  * a query pattern, to determine at which steps the pattern might fail to match.
194  */
195 typedef struct {
196   TSStateId parse_state;
197   TSSymbol parent_symbol;
198   uint16_t child_index;
199   TSFieldId field_id: 15;
200   bool done: 1;
201 } AnalysisStateEntry;
202 
203 typedef struct {
204   AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH];
205   uint16_t depth;
206   uint16_t step_index;
207 } AnalysisState;
208 
209 typedef Array(AnalysisState) AnalysisStateSet;
210 
211 /*
212  * AnalysisSubgraph - A subset of the states in the parse table that are used
213  * in constructing nodes with a certain symbol. Each state is accompanied by
214  * some information about the possible node that could be produced in
215  * downstream states.
216  */
217 typedef struct {
218   TSStateId state;
219   uint8_t production_id;
220   uint8_t child_index: 7;
221   bool done: 1;
222 } AnalysisSubgraphNode;
223 
224 typedef struct {
225   TSSymbol symbol;
226   Array(TSStateId) start_states;
227   Array(AnalysisSubgraphNode) nodes;
228 } AnalysisSubgraph;
229 
230 /*
231  * StatePredecessorMap - A map that stores the predecessors of each parse state.
232  * This is used during query analysis to determine which parse states can lead
233  * to which reduce actions.
234  */
235 typedef struct {
236   TSStateId *contents;
237 } StatePredecessorMap;
238 
239 /*
240  * TSQuery - A tree query, compiled from a string of S-expressions. The query
241  * itself is immutable. The mutable state used in the process of executing the
242  * query is stored in a `TSQueryCursor`.
243  */
244 struct TSQuery {
245   SymbolTable captures;
246   SymbolTable predicate_values;
247   Array(QueryStep) steps;
248   Array(PatternEntry) pattern_map;
249   Array(TSQueryPredicateStep) predicate_steps;
250   Array(QueryPattern) patterns;
251   Array(StepOffset) step_offsets;
252   Array(TSFieldId) negated_fields;
253   Array(char) string_buffer;
254   const TSLanguage *language;
255   uint16_t wildcard_root_pattern_count;
256 };
257 
258 /*
259  * TSQueryCursor - A stateful struct used to execute a query on a tree.
260  */
261 struct TSQueryCursor {
262   const TSQuery *query;
263   TSTreeCursor cursor;
264   Array(QueryState) states;
265   Array(QueryState) finished_states;
266   CaptureListPool capture_list_pool;
267   uint32_t depth;
268   uint32_t start_byte;
269   uint32_t end_byte;
270   TSPoint start_point;
271   TSPoint end_point;
272   uint32_t next_state_id;
273   bool ascending;
274   bool halted;
275   bool did_exceed_match_limit;
276 };
277 
278 static const TSQueryError PARENT_DONE = -1;
279 static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX;
280 static const uint16_t NONE = UINT16_MAX;
281 static const TSSymbol WILDCARD_SYMBOL = 0;
282 static const TSSymbol NAMED_WILDCARD_SYMBOL = UINT16_MAX - 1;
283 
284 /**********
285  * Stream
286  **********/
287 
288 // Advance to the next unicode code point in the stream.
stream_advance(Stream * self)289 static bool stream_advance(Stream *self) {
290   self->input += self->next_size;
291   if (self->input < self->end) {
292     uint32_t size = ts_decode_utf8(
293       (const uint8_t *)self->input,
294       self->end - self->input,
295       &self->next
296     );
297     if (size > 0) {
298       self->next_size = size;
299       return true;
300     }
301   } else {
302     self->next_size = 0;
303     self->next = '\0';
304   }
305   return false;
306 }
307 
308 // Reset the stream to the given input position, represented as a pointer
309 // into the input string.
stream_reset(Stream * self,const char * input)310 static void stream_reset(Stream *self, const char *input) {
311   self->input = input;
312   self->next_size = 0;
313   stream_advance(self);
314 }
315 
stream_new(const char * string,uint32_t length)316 static Stream stream_new(const char *string, uint32_t length) {
317   Stream self = {
318     .next = 0,
319     .input = string,
320     .start = string,
321     .end = string + length,
322   };
323   stream_advance(&self);
324   return self;
325 }
326 
stream_skip_whitespace(Stream * self)327 static void stream_skip_whitespace(Stream *self) {
328   for (;;) {
329     if (iswspace(self->next)) {
330       stream_advance(self);
331     } else if (self->next == ';') {
332       // skip over comments
333       stream_advance(self);
334       while (self->next && self->next != '\n') {
335         if (!stream_advance(self)) break;
336       }
337     } else {
338       break;
339     }
340   }
341 }
342 
stream_is_ident_start(Stream * self)343 static bool stream_is_ident_start(Stream *self) {
344   return iswalnum(self->next) || self->next == '_' || self->next == '-';
345 }
346 
stream_scan_identifier(Stream * stream)347 static void stream_scan_identifier(Stream *stream) {
348   do {
349     stream_advance(stream);
350   } while (
351     iswalnum(stream->next) ||
352     stream->next == '_' ||
353     stream->next == '-' ||
354     stream->next == '.' ||
355     stream->next == '?' ||
356     stream->next == '!'
357   );
358 }
359 
stream_offset(Stream * self)360 static uint32_t stream_offset(Stream *self) {
361   return self->input - self->start;
362 }
363 
364 /******************
365  * CaptureListPool
366  ******************/
367 
capture_list_pool_new(void)368 static CaptureListPool capture_list_pool_new(void) {
369   return (CaptureListPool) {
370     .list = array_new(),
371     .empty_list = array_new(),
372     .max_capture_list_count = UINT32_MAX,
373     .free_capture_list_count = 0,
374   };
375 }
376 
capture_list_pool_reset(CaptureListPool * self)377 static void capture_list_pool_reset(CaptureListPool *self) {
378   for (uint16_t i = 0; i < self->list.size; i++) {
379     // This invalid size means that the list is not in use.
380     self->list.contents[i].size = UINT32_MAX;
381   }
382   self->free_capture_list_count = self->list.size;
383 }
384 
capture_list_pool_delete(CaptureListPool * self)385 static void capture_list_pool_delete(CaptureListPool *self) {
386   for (uint16_t i = 0; i < self->list.size; i++) {
387     array_delete(&self->list.contents[i]);
388   }
389   array_delete(&self->list);
390 }
391 
capture_list_pool_get(const CaptureListPool * self,uint16_t id)392 static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) {
393   if (id >= self->list.size) return &self->empty_list;
394   return &self->list.contents[id];
395 }
396 
capture_list_pool_get_mut(CaptureListPool * self,uint16_t id)397 static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) {
398   assert(id < self->list.size);
399   return &self->list.contents[id];
400 }
401 
capture_list_pool_is_empty(const CaptureListPool * self)402 static bool capture_list_pool_is_empty(const CaptureListPool *self) {
403   // The capture list pool is empty if all allocated lists are in use, and we
404   // have reached the maximum allowed number of allocated lists.
405   return self->free_capture_list_count == 0 && self->list.size >= self->max_capture_list_count;
406 }
407 
capture_list_pool_acquire(CaptureListPool * self)408 static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
409   // First see if any already allocated capture list is currently unused.
410   if (self->free_capture_list_count > 0) {
411     for (uint16_t i = 0; i < self->list.size; i++) {
412       if (self->list.contents[i].size == UINT32_MAX) {
413         array_clear(&self->list.contents[i]);
414         self->free_capture_list_count--;
415         return i;
416       }
417     }
418   }
419 
420   // Otherwise allocate and initialize a new capture list, as long as that
421   // doesn't put us over the requested maximum.
422   uint32_t i = self->list.size;
423   if (i >= self->max_capture_list_count) {
424     return NONE;
425   }
426   CaptureList list;
427   array_init(&list);
428   array_push(&self->list, list);
429   return i;
430 }
431 
capture_list_pool_release(CaptureListPool * self,uint16_t id)432 static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
433   if (id >= self->list.size) return;
434   self->list.contents[id].size = UINT32_MAX;
435   self->free_capture_list_count++;
436 }
437 
438 /**************
439  * SymbolTable
440  **************/
441 
symbol_table_new(void)442 static SymbolTable symbol_table_new(void) {
443   return (SymbolTable) {
444     .characters = array_new(),
445     .slices = array_new(),
446   };
447 }
448 
symbol_table_delete(SymbolTable * self)449 static void symbol_table_delete(SymbolTable *self) {
450   array_delete(&self->characters);
451   array_delete(&self->slices);
452 }
453 
symbol_table_id_for_name(const SymbolTable * self,const char * name,uint32_t length)454 static int symbol_table_id_for_name(
455   const SymbolTable *self,
456   const char *name,
457   uint32_t length
458 ) {
459   for (unsigned i = 0; i < self->slices.size; i++) {
460     Slice slice = self->slices.contents[i];
461     if (
462       slice.length == length &&
463       !strncmp(&self->characters.contents[slice.offset], name, length)
464     ) return i;
465   }
466   return -1;
467 }
468 
symbol_table_name_for_id(const SymbolTable * self,uint16_t id,uint32_t * length)469 static const char *symbol_table_name_for_id(
470   const SymbolTable *self,
471   uint16_t id,
472   uint32_t *length
473 ) {
474   Slice slice = self->slices.contents[id];
475   *length = slice.length;
476   return &self->characters.contents[slice.offset];
477 }
478 
symbol_table_insert_name(SymbolTable * self,const char * name,uint32_t length)479 static uint16_t symbol_table_insert_name(
480   SymbolTable *self,
481   const char *name,
482   uint32_t length
483 ) {
484   int id = symbol_table_id_for_name(self, name, length);
485   if (id >= 0) return (uint16_t)id;
486   Slice slice = {
487     .offset = self->characters.size,
488     .length = length,
489   };
490   array_grow_by(&self->characters, length + 1);
491   memcpy(&self->characters.contents[slice.offset], name, length);
492   self->characters.contents[self->characters.size - 1] = 0;
493   array_push(&self->slices, slice);
494   return self->slices.size - 1;
495 }
496 
497 /************
498  * QueryStep
499  ************/
500 
query_step__new(TSSymbol symbol,uint16_t depth,bool is_immediate)501 static QueryStep query_step__new(
502   TSSymbol symbol,
503   uint16_t depth,
504   bool is_immediate
505 ) {
506   return (QueryStep) {
507     .symbol = symbol,
508     .depth = depth,
509     .field = 0,
510     .capture_ids = {NONE, NONE, NONE},
511     .alternative_index = NONE,
512     .negated_field_list_id = 0,
513     .contains_captures = false,
514     .is_last_child = false,
515     .is_pass_through = false,
516     .is_dead_end = false,
517     .is_definite = false,
518     .is_immediate = is_immediate,
519     .alternative_is_immediate = false,
520   };
521 }
522 
query_step__add_capture(QueryStep * self,uint16_t capture_id)523 static void query_step__add_capture(QueryStep *self, uint16_t capture_id) {
524   for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) {
525     if (self->capture_ids[i] == NONE) {
526       self->capture_ids[i] = capture_id;
527       break;
528     }
529   }
530 }
531 
query_step__remove_capture(QueryStep * self,uint16_t capture_id)532 static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) {
533   for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) {
534     if (self->capture_ids[i] == capture_id) {
535       self->capture_ids[i] = NONE;
536       while (i + 1 < MAX_STEP_CAPTURE_COUNT) {
537         if (self->capture_ids[i + 1] == NONE) break;
538         self->capture_ids[i] = self->capture_ids[i + 1];
539         self->capture_ids[i + 1] = NONE;
540         i++;
541       }
542       break;
543     }
544   }
545 }
546 
547 /**********************
548  * StatePredecessorMap
549  **********************/
550 
state_predecessor_map_new(const TSLanguage * language)551 static inline StatePredecessorMap state_predecessor_map_new(const TSLanguage *language) {
552   return (StatePredecessorMap) {
553     .contents = ts_calloc(language->state_count * (MAX_STATE_PREDECESSOR_COUNT + 1), sizeof(TSStateId)),
554   };
555 }
556 
state_predecessor_map_delete(StatePredecessorMap * self)557 static inline void state_predecessor_map_delete(StatePredecessorMap *self) {
558   ts_free(self->contents);
559 }
560 
state_predecessor_map_add(StatePredecessorMap * self,TSStateId state,TSStateId predecessor)561 static inline void state_predecessor_map_add(
562   StatePredecessorMap *self,
563   TSStateId state,
564   TSStateId predecessor
565 ) {
566   unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1);
567   TSStateId *count = &self->contents[index];
568   if (*count == 0 || (*count < MAX_STATE_PREDECESSOR_COUNT && self->contents[index + *count] != predecessor)) {
569     (*count)++;
570     self->contents[index + *count] = predecessor;
571   }
572 }
573 
state_predecessor_map_get(const StatePredecessorMap * self,TSStateId state,unsigned * count)574 static inline const TSStateId *state_predecessor_map_get(
575   const StatePredecessorMap *self,
576   TSStateId state,
577   unsigned *count
578 ) {
579   unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1);
580   *count = self->contents[index];
581   return &self->contents[index + 1];
582 }
583 
584 /****************
585  * AnalysisState
586  ****************/
587 
analysis_state__recursion_depth(const AnalysisState * self)588 static unsigned analysis_state__recursion_depth(const AnalysisState *self) {
589   unsigned result = 0;
590   for (unsigned i = 0; i < self->depth; i++) {
591     TSSymbol symbol = self->stack[i].parent_symbol;
592     for (unsigned j = 0; j < i; j++) {
593       if (self->stack[j].parent_symbol == symbol) {
594         result++;
595         break;
596       }
597     }
598   }
599   return result;
600 }
601 
analysis_state__compare_position(const AnalysisState * self,const AnalysisState * other)602 static inline int analysis_state__compare_position(
603   const AnalysisState *self,
604   const AnalysisState *other
605 ) {
606   for (unsigned i = 0; i < self->depth; i++) {
607     if (i >= other->depth) return -1;
608     if (self->stack[i].child_index < other->stack[i].child_index) return -1;
609     if (self->stack[i].child_index > other->stack[i].child_index) return 1;
610   }
611   if (self->depth < other->depth) return 1;
612   return 0;
613 }
614 
analysis_state__compare(const AnalysisState * self,const AnalysisState * other)615 static inline int analysis_state__compare(
616   const AnalysisState *self,
617   const AnalysisState *other
618 ) {
619   int result = analysis_state__compare_position(self, other);
620   if (result != 0) return result;
621   for (unsigned i = 0; i < self->depth; i++) {
622     if (self->stack[i].parent_symbol < other->stack[i].parent_symbol) return -1;
623     if (self->stack[i].parent_symbol > other->stack[i].parent_symbol) return 1;
624     if (self->stack[i].parse_state < other->stack[i].parse_state) return -1;
625     if (self->stack[i].parse_state > other->stack[i].parse_state) return 1;
626     if (self->stack[i].field_id < other->stack[i].field_id) return -1;
627     if (self->stack[i].field_id > other->stack[i].field_id) return 1;
628   }
629   if (self->step_index < other->step_index) return -1;
630   if (self->step_index > other->step_index) return 1;
631   return 0;
632 }
633 
analysis_state__top(AnalysisState * self)634 static inline AnalysisStateEntry *analysis_state__top(AnalysisState *self) {
635   return &self->stack[self->depth - 1];
636 }
637 
analysis_state__has_supertype(AnalysisState * self,TSSymbol symbol)638 static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol symbol) {
639   for (unsigned i = 0; i < self->depth; i++) {
640     if (self->stack[i].parent_symbol == symbol) return true;
641   }
642   return false;
643 }
644 
645 /***********************
646  * AnalysisSubgraphNode
647  ***********************/
648 
analysis_subgraph_node__compare(const AnalysisSubgraphNode * self,const AnalysisSubgraphNode * other)649 static inline int analysis_subgraph_node__compare(const AnalysisSubgraphNode *self, const AnalysisSubgraphNode *other) {
650   if (self->state < other->state) return -1;
651   if (self->state > other->state) return 1;
652   if (self->child_index < other->child_index) return -1;
653   if (self->child_index > other->child_index) return 1;
654   if (self->done < other->done) return -1;
655   if (self->done > other->done) return 1;
656   if (self->production_id < other->production_id) return -1;
657   if (self->production_id > other->production_id) return 1;
658   return 0;
659 }
660 
661 /*********
662  * Query
663  *********/
664 
665 // The `pattern_map` contains a mapping from TSSymbol values to indices in the
666 // `steps` array. For a given syntax node, the `pattern_map` makes it possible
667 // to quickly find the starting steps of all of the patterns whose root matches
668 // that node. Each entry has two fields: a `pattern_index`, which identifies one
669 // of the patterns in the query, and a `step_index`, which indicates the start
670 // offset of that pattern's steps within the `steps` array.
671 //
672 // The entries are sorted by the patterns' root symbols, and lookups use a
673 // binary search. This ensures that the cost of this initial lookup step
674 // scales logarithmically with the number of patterns in the query.
675 //
676 // This returns `true` if the symbol is present and `false` otherwise.
677 // If the symbol is not present `*result` is set to the index where the
678 // symbol should be inserted.
ts_query__pattern_map_search(const TSQuery * self,TSSymbol needle,uint32_t * result)679 static inline bool ts_query__pattern_map_search(
680   const TSQuery *self,
681   TSSymbol needle,
682   uint32_t *result
683 ) {
684   uint32_t base_index = self->wildcard_root_pattern_count;
685   uint32_t size = self->pattern_map.size - base_index;
686   if (size == 0) {
687     *result = base_index;
688     return false;
689   }
690   while (size > 1) {
691     uint32_t half_size = size / 2;
692     uint32_t mid_index = base_index + half_size;
693     TSSymbol mid_symbol = self->steps.contents[
694       self->pattern_map.contents[mid_index].step_index
695     ].symbol;
696     if (needle > mid_symbol) base_index = mid_index;
697     size -= half_size;
698   }
699 
700   TSSymbol symbol = self->steps.contents[
701     self->pattern_map.contents[base_index].step_index
702   ].symbol;
703 
704   if (needle > symbol) {
705     base_index++;
706     if (base_index < self->pattern_map.size) {
707       symbol = self->steps.contents[
708         self->pattern_map.contents[base_index].step_index
709       ].symbol;
710     }
711   }
712 
713   *result = base_index;
714   return needle == symbol;
715 }
716 
717 // Insert a new pattern's start index into the pattern map, maintaining
718 // the pattern map's ordering invariant.
ts_query__pattern_map_insert(TSQuery * self,TSSymbol symbol,PatternEntry new_entry)719 static inline void ts_query__pattern_map_insert(
720   TSQuery *self,
721   TSSymbol symbol,
722   PatternEntry new_entry
723 ) {
724   uint32_t index;
725   ts_query__pattern_map_search(self, symbol, &index);
726 
727   // Ensure that the entries are sorted not only by symbol, but also
728   // by pattern_index. This way, states for earlier patterns will be
729   // initiated first, which allows the ordering of the states array
730   // to be maintained more efficiently.
731   while (index < self->pattern_map.size) {
732     PatternEntry *entry = &self->pattern_map.contents[index];
733     if (
734       self->steps.contents[entry->step_index].symbol == symbol &&
735       entry->pattern_index < new_entry.pattern_index
736     ) {
737       index++;
738     } else {
739       break;
740     }
741   }
742 
743   array_insert(&self->pattern_map, index, new_entry);
744 }
745 
ts_query__analyze_patterns(TSQuery * self,unsigned * error_offset)746 static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
747   // Identify all of the patterns in the query that have child patterns, both at the
748   // top level and nested within other larger patterns. Record the step index where
749   // each pattern starts.
750   Array(uint32_t) parent_step_indices = array_new();
751   for (unsigned i = 0; i < self->steps.size; i++) {
752     QueryStep *step = &self->steps.contents[i];
753     if (i + 1 < self->steps.size) {
754       QueryStep *next_step = &self->steps.contents[i + 1];
755       if (
756         step->symbol != WILDCARD_SYMBOL &&
757         step->symbol != NAMED_WILDCARD_SYMBOL &&
758         next_step->depth > step->depth &&
759         next_step->depth != PATTERN_DONE_MARKER
760       ) {
761         array_push(&parent_step_indices, i);
762       }
763     }
764     if (step->depth > 0) {
765       step->is_definite = true;
766     }
767   }
768 
769   // For every parent symbol in the query, initialize an 'analysis subgraph'.
770   // This subgraph lists all of the states in the parse table that are directly
771   // involved in building subtrees for this symbol.
772   //
773   // In addition to the parent symbols in the query, construct subgraphs for all
774   // of the hidden symbols in the grammar, because these might occur within
775   // one of the parent nodes, such that their children appear to belong to the
776   // parent.
777   Array(AnalysisSubgraph) subgraphs = array_new();
778   for (unsigned i = 0; i < parent_step_indices.size; i++) {
779     uint32_t parent_step_index = parent_step_indices.contents[i];
780     TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol;
781     AnalysisSubgraph subgraph = { .symbol = parent_symbol };
782     array_insert_sorted_by(&subgraphs, .symbol, subgraph);
783   }
784   for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) {
785     if (!ts_language_symbol_metadata(self->language, sym).visible) {
786       AnalysisSubgraph subgraph = { .symbol = sym };
787       array_insert_sorted_by(&subgraphs, .symbol, subgraph);
788     }
789   }
790 
791   // Scan the parse table to find the data needed to populate these subgraphs.
792   // Collect three things during this scan:
793   //   1) All of the parse states where one of these symbols can start.
794   //   2) All of the parse states where one of these symbols can end, along
795   //      with information about the node that would be created.
796   //   3) A list of predecessor states for each state.
797   StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language);
798   for (TSStateId state = 1; state < self->language->state_count; state++) {
799     unsigned subgraph_index, exists;
800     LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state);
801     while (ts_lookahead_iterator_next(&lookahead_iterator)) {
802       if (lookahead_iterator.action_count) {
803         for (unsigned i = 0; i < lookahead_iterator.action_count; i++) {
804           const TSParseAction *action = &lookahead_iterator.actions[i];
805           if (action->type == TSParseActionTypeReduce) {
806             const TSSymbol *aliases, *aliases_end;
807             ts_language_aliases_for_symbol(
808               self->language,
809               action->reduce.symbol,
810               &aliases,
811               &aliases_end
812             );
813             for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
814               array_search_sorted_by(
815                 &subgraphs,
816                 .symbol,
817                 *symbol,
818                 &subgraph_index,
819                 &exists
820               );
821               if (exists) {
822                 AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
823                 if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
824                   array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
825                     .state = state,
826                     .production_id = action->reduce.production_id,
827                     .child_index = action->reduce.child_count,
828                     .done = true,
829                   }));
830                 }
831               }
832             }
833           } else if (action->type == TSParseActionTypeShift && !action->shift.extra) {
834             TSStateId next_state = action->shift.state;
835             state_predecessor_map_add(&predecessor_map, next_state, state);
836           }
837         }
838       } else if (lookahead_iterator.next_state != 0) {
839         if (lookahead_iterator.next_state != state) {
840           state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
841         }
842         const TSSymbol *aliases, *aliases_end;
843         ts_language_aliases_for_symbol(
844           self->language,
845           lookahead_iterator.symbol,
846           &aliases,
847           &aliases_end
848         );
849         for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
850           array_search_sorted_by(
851             &subgraphs,
852             .symbol,
853             *symbol,
854             &subgraph_index,
855             &exists
856           );
857           if (exists) {
858             AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
859             if (
860               subgraph->start_states.size == 0 ||
861               *array_back(&subgraph->start_states) != state
862             )
863             array_push(&subgraph->start_states, state);
864           }
865         }
866       }
867     }
868   }
869 
870   // For each subgraph, compute the preceding states by walking backward
871   // from the end states using the predecessor map.
872   Array(AnalysisSubgraphNode) next_nodes = array_new();
873   for (unsigned i = 0; i < subgraphs.size; i++) {
874     AnalysisSubgraph *subgraph = &subgraphs.contents[i];
875     if (subgraph->nodes.size == 0) {
876       array_delete(&subgraph->start_states);
877       array_erase(&subgraphs, i);
878       i--;
879       continue;
880     }
881     array_assign(&next_nodes, &subgraph->nodes);
882     while (next_nodes.size > 0) {
883       AnalysisSubgraphNode node = array_pop(&next_nodes);
884       if (node.child_index > 1) {
885         unsigned predecessor_count;
886         const TSStateId *predecessors = state_predecessor_map_get(
887           &predecessor_map,
888           node.state,
889           &predecessor_count
890         );
891         for (unsigned j = 0; j < predecessor_count; j++) {
892           AnalysisSubgraphNode predecessor_node = {
893             .state = predecessors[j],
894             .child_index = node.child_index - 1,
895             .production_id = node.production_id,
896             .done = false,
897           };
898           unsigned index, exists;
899           array_search_sorted_with(
900             &subgraph->nodes, analysis_subgraph_node__compare, &predecessor_node,
901             &index, &exists
902           );
903           if (!exists) {
904             array_insert(&subgraph->nodes, index, predecessor_node);
905             array_push(&next_nodes, predecessor_node);
906           }
907         }
908       }
909     }
910   }
911 
912   #ifdef DEBUG_ANALYZE_QUERY
913     printf("\nSubgraphs:\n");
914     for (unsigned i = 0; i < subgraphs.size; i++) {
915       AnalysisSubgraph *subgraph = &subgraphs.contents[i];
916       printf("  %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol));
917       for (unsigned j = 0; j < subgraph->start_states.size; j++) {
918         printf(
919           "    {state: %u}\n",
920           subgraph->start_states.contents[j]
921         );
922       }
923       for (unsigned j = 0; j < subgraph->nodes.size; j++) {
924         AnalysisSubgraphNode *node = &subgraph->nodes.contents[j];
925         printf(
926           "    {state: %u, child_index: %u, production_id: %u, done: %d}\n",
927           node->state, node->child_index, node->production_id, node->done
928         );
929       }
930       printf("\n");
931     }
932   #endif
933 
934   // For each non-terminal pattern, determine if the pattern can successfully match,
935   // and identify all of the possible children within the pattern where matching could fail.
936   bool all_patterns_are_valid = true;
937   AnalysisStateSet states = array_new();
938   AnalysisStateSet next_states = array_new();
939   AnalysisStateSet deeper_states = array_new();
940   Array(uint16_t) final_step_indices = array_new();
941   for (unsigned i = 0; i < parent_step_indices.size; i++) {
942     uint16_t parent_step_index = parent_step_indices.contents[i];
943     uint16_t parent_depth = self->steps.contents[parent_step_index].depth;
944     TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol;
945     if (parent_symbol == ts_builtin_sym_error) continue;
946 
947     // Find the subgraph that corresponds to this pattern's root symbol. If the pattern's
948     // root symbols is not a non-terminal, then return an error.
949     unsigned subgraph_index, exists;
950     array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists);
951     if (!exists) {
952       unsigned first_child_step_index = parent_step_index + 1;
953       uint32_t i, exists;
954       array_search_sorted_by(&self->step_offsets, .step_index, first_child_step_index, &i, &exists);
955       assert(exists);
956       *error_offset = self->step_offsets.contents[i].byte_offset;
957       all_patterns_are_valid = false;
958       break;
959     }
960 
961     // Initialize an analysis state at every parse state in the table where
962     // this parent symbol can occur.
963     AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
964     array_clear(&states);
965     array_clear(&deeper_states);
966     for (unsigned j = 0; j < subgraph->start_states.size; j++) {
967       TSStateId parse_state = subgraph->start_states.contents[j];
968       array_push(&states, ((AnalysisState) {
969         .step_index = parent_step_index + 1,
970         .stack = {
971           [0] = {
972             .parse_state = parse_state,
973             .parent_symbol = parent_symbol,
974             .child_index = 0,
975             .field_id = 0,
976             .done = false,
977           },
978         },
979         .depth = 1,
980       }));
981     }
982 
983     // Walk the subgraph for this non-terminal, tracking all of the possible
984     // sequences of progress within the pattern.
985     bool can_finish_pattern = false;
986     bool did_exceed_max_depth = false;
987     unsigned recursion_depth_limit = 0;
988     unsigned prev_final_step_count = 0;
989     array_clear(&final_step_indices);
990     for (;;) {
991       #ifdef DEBUG_ANALYZE_QUERY
992         printf("Final step indices:");
993         for (unsigned j = 0; j < final_step_indices.size; j++) {
994           printf(" %4u", final_step_indices.contents[j]);
995         }
996         printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol));
997         for (unsigned j = 0; j < states.size; j++) {
998           AnalysisState *state = &states.contents[j];
999           printf("  %3u: step: %u, stack: [", j, state->step_index);
1000           for (unsigned k = 0; k < state->depth; k++) {
1001             printf(
1002               " {%s, child: %u, state: %4u",
1003               self->language->symbol_names[state->stack[k].parent_symbol],
1004               state->stack[k].child_index,
1005               state->stack[k].parse_state
1006             );
1007             if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]);
1008             if (state->stack[k].done) printf(", DONE");
1009             printf("}");
1010           }
1011           printf(" ]\n");
1012         }
1013       #endif
1014 
1015       // If no further progress can be made within the current recursion depth limit, then
1016       // bump the depth limit by one, and continue to process the states the exceeded the
1017       // limit. But only allow this if progress has been made since the last time the depth
1018       // limit was increased.
1019       if (states.size == 0) {
1020         if (deeper_states.size > 0 && final_step_indices.size > prev_final_step_count) {
1021           #ifdef DEBUG_ANALYZE_QUERY
1022             printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1);
1023           #endif
1024 
1025           prev_final_step_count = final_step_indices.size;
1026           recursion_depth_limit++;
1027           AnalysisStateSet _states = states;
1028           states = deeper_states;
1029           deeper_states = _states;
1030           continue;
1031         }
1032 
1033         break;
1034       }
1035 
1036       array_clear(&next_states);
1037       for (unsigned j = 0; j < states.size; j++) {
1038         AnalysisState * const state = &states.contents[j];
1039 
1040         // For efficiency, it's important to avoid processing the same analysis state more
1041         // than once. To achieve this, keep the states in order of ascending position within
1042         // their hypothetical syntax trees. In each iteration of this loop, start by advancing
1043         // the states that have made the least progress. Avoid advancing states that have already
1044         // made more progress.
1045         if (next_states.size > 0) {
1046           int comparison = analysis_state__compare_position(state, array_back(&next_states));
1047           if (comparison == 0) {
1048             array_insert_sorted_with(&next_states, analysis_state__compare, *state);
1049             continue;
1050           } else if (comparison > 0) {
1051             while (j < states.size) {
1052               array_push(&next_states, states.contents[j]);
1053               j++;
1054             }
1055             break;
1056           }
1057         }
1058 
1059         const TSStateId parse_state = analysis_state__top(state)->parse_state;
1060         const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol;
1061         const TSFieldId parent_field_id = analysis_state__top(state)->field_id;
1062         const unsigned child_index = analysis_state__top(state)->child_index;
1063         const QueryStep * const step = &self->steps.contents[state->step_index];
1064 
1065         unsigned subgraph_index, exists;
1066         array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists);
1067         if (!exists) continue;
1068         const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
1069 
1070         // Follow every possible path in the parse table, but only visit states that
1071         // are part of the subgraph for the current symbol.
1072         LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state);
1073         while (ts_lookahead_iterator_next(&lookahead_iterator)) {
1074           TSSymbol sym = lookahead_iterator.symbol;
1075 
1076           TSStateId next_parse_state;
1077           if (lookahead_iterator.action_count) {
1078             const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1];
1079             if (action->type == TSParseActionTypeShift) {
1080               next_parse_state = action->shift.extra ? parse_state : action->shift.state;
1081             } else {
1082               continue;
1083             }
1084           } else if (lookahead_iterator.next_state != 0) {
1085             next_parse_state = lookahead_iterator.next_state;
1086           } else {
1087             continue;
1088           }
1089 
1090           AnalysisSubgraphNode successor = {
1091             .state = next_parse_state,
1092             .child_index = child_index + 1,
1093           };
1094           unsigned node_index;
1095           array_search_sorted_with(
1096             &subgraph->nodes,
1097             analysis_subgraph_node__compare, &successor,
1098             &node_index, &exists
1099           );
1100           while (node_index < subgraph->nodes.size) {
1101             AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
1102             if (node->state != successor.state || node->child_index != successor.child_index) break;
1103 
1104             // Use the subgraph to determine what alias and field will eventually be applied
1105             // to this child node.
1106             TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
1107             TSSymbol visible_symbol = alias
1108               ? alias
1109               : self->language->symbol_metadata[sym].visible
1110                 ? self->language->public_symbol_map[sym]
1111                 : 0;
1112             TSFieldId field_id = parent_field_id;
1113             if (!field_id) {
1114               const TSFieldMapEntry *field_map, *field_map_end;
1115               ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
1116               for (; field_map != field_map_end; field_map++) {
1117                 if (!field_map->inherited && field_map->child_index == child_index) {
1118                   field_id = field_map->field_id;
1119                   break;
1120                 }
1121               }
1122             }
1123 
1124             // Create a new state that has advanced past this hypothetical subtree.
1125             AnalysisState next_state = *state;
1126             analysis_state__top(&next_state)->child_index++;
1127             analysis_state__top(&next_state)->parse_state = successor.state;
1128             if (node->done) analysis_state__top(&next_state)->done = true;
1129 
1130             // Determine if this hypothetical child node would match the current step
1131             // of the query pattern.
1132             bool does_match = false;
1133             if (visible_symbol) {
1134               does_match = true;
1135               if (step->symbol == NAMED_WILDCARD_SYMBOL) {
1136                 if (!self->language->symbol_metadata[visible_symbol].named) does_match = false;
1137               } else if (step->symbol != WILDCARD_SYMBOL) {
1138                 if (step->symbol != visible_symbol) does_match = false;
1139               }
1140               if (step->field && step->field != field_id) {
1141                 does_match = false;
1142               }
1143               if (
1144                 step->supertype_symbol &&
1145                 !analysis_state__has_supertype(state, step->supertype_symbol)
1146               ) does_match = false;
1147             }
1148 
1149             // If this is a hidden child, then push a new entry to the stack, in order to
1150             // walk through the children of this child.
1151             else if (sym >= self->language->token_count) {
1152               if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) {
1153                 did_exceed_max_depth = true;
1154                 continue;
1155               }
1156 
1157               next_state.depth++;
1158               analysis_state__top(&next_state)->parse_state = parse_state;
1159               analysis_state__top(&next_state)->child_index = 0;
1160               analysis_state__top(&next_state)->parent_symbol = sym;
1161               analysis_state__top(&next_state)->field_id = field_id;
1162               analysis_state__top(&next_state)->done = false;
1163 
1164               if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) {
1165                 array_insert_sorted_with(&deeper_states, analysis_state__compare, next_state);
1166                 continue;
1167               }
1168             }
1169 
1170             // Pop from the stack when this state reached the end of its current syntax node.
1171             while (next_state.depth > 0 && analysis_state__top(&next_state)->done) {
1172               next_state.depth--;
1173             }
1174 
1175             // If this hypothetical child did match the current step of the query pattern,
1176             // then advance to the next step at the current depth. This involves skipping
1177             // over any descendant steps of the current child.
1178             const QueryStep *next_step = step;
1179             if (does_match) {
1180               for (;;) {
1181                 next_state.step_index++;
1182                 next_step = &self->steps.contents[next_state.step_index];
1183                 if (
1184                   next_step->depth == PATTERN_DONE_MARKER ||
1185                   next_step->depth <= parent_depth + 1
1186                 ) break;
1187               }
1188             } else if (next_parse_state == parse_state) {
1189               continue;
1190             }
1191 
1192             for (;;) {
1193               // Skip pass-through states. Although these states have alternatives, they are only
1194               // used to implement repetitions, and query analysis does not need to process
1195               // repetitions in order to determine whether steps are possible and definite.
1196               if (next_step->is_pass_through) {
1197                 next_state.step_index++;
1198                 next_step++;
1199                 continue;
1200               }
1201 
1202               // If the pattern is finished or hypothetical parent node is complete, then
1203               // record that matching can terminate at this step of the pattern. Otherwise,
1204               // add this state to the list of states to process on the next iteration.
1205               if (!next_step->is_dead_end) {
1206                 bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
1207                 if (did_finish_pattern) can_finish_pattern = true;
1208                 if (did_finish_pattern || next_state.depth == 0) {
1209                   array_insert_sorted_by(&final_step_indices, , next_state.step_index);
1210                 } else {
1211                   array_insert_sorted_with(&next_states, analysis_state__compare, next_state);
1212                 }
1213               }
1214 
1215               // If the state has advanced to a step with an alternative step, then add another state
1216               // at that alternative step. This process is simpler than the process of actually matching a
1217               // pattern during query exection, because for the purposes of query analysis, there is no
1218               // need to process repetitions.
1219               if (
1220                 does_match &&
1221                 next_step->alternative_index != NONE &&
1222                 next_step->alternative_index > next_state.step_index
1223               ) {
1224                 next_state.step_index = next_step->alternative_index;
1225                 next_step = &self->steps.contents[next_state.step_index];
1226               } else {
1227                 break;
1228               }
1229             }
1230           }
1231         }
1232       }
1233 
1234       AnalysisStateSet _states = states;
1235       states = next_states;
1236       next_states = _states;
1237     }
1238 
1239     // Mark as indefinite any step where a match terminated.
1240     // Later, this property will be propagated to all of the step's predecessors.
1241     for (unsigned j = 0; j < final_step_indices.size; j++) {
1242       uint32_t final_step_index = final_step_indices.contents[j];
1243       QueryStep *step = &self->steps.contents[final_step_index];
1244       if (
1245         step->depth != PATTERN_DONE_MARKER &&
1246         step->depth > parent_depth &&
1247         !step->is_dead_end
1248       ) {
1249         step->is_definite = false;
1250       }
1251     }
1252 
1253     if (did_exceed_max_depth) {
1254       for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) {
1255         QueryStep *step = &self->steps.contents[j];
1256         if (
1257           step->depth <= parent_depth ||
1258           step->depth == PATTERN_DONE_MARKER
1259         ) break;
1260         if (!step->is_dead_end) {
1261           step->is_definite = false;
1262         }
1263       }
1264     }
1265 
1266     // If this pattern cannot match, store the pattern index so that it can be
1267     // returned to the caller.
1268     if (all_patterns_are_valid && !can_finish_pattern && !did_exceed_max_depth) {
1269       assert(final_step_indices.size > 0);
1270       uint16_t impossible_step_index = *array_back(&final_step_indices);
1271       uint32_t i, exists;
1272       array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists);
1273       if (i >= self->step_offsets.size) i = self->step_offsets.size - 1;
1274       *error_offset = self->step_offsets.contents[i].byte_offset;
1275       all_patterns_are_valid = false;
1276       break;
1277     }
1278   }
1279 
1280   // Mark as indefinite any step with captures that are used in predicates.
1281   Array(uint16_t) predicate_capture_ids = array_new();
1282   for (unsigned i = 0; i < self->patterns.size; i++) {
1283     QueryPattern *pattern = &self->patterns.contents[i];
1284 
1285     // Gather all of the captures that are used in predicates for this pattern.
1286     array_clear(&predicate_capture_ids);
1287     for (
1288       unsigned start = pattern->predicate_steps.offset,
1289       end = start + pattern->predicate_steps.length,
1290       j = start; j < end; j++
1291     ) {
1292       TSQueryPredicateStep *step = &self->predicate_steps.contents[j];
1293       if (step->type == TSQueryPredicateStepTypeCapture) {
1294         array_insert_sorted_by(&predicate_capture_ids, , step->value_id);
1295       }
1296     }
1297 
1298     // Find all of the steps that have these captures.
1299     for (
1300       unsigned start = pattern->steps.offset,
1301       end = start + pattern->steps.length,
1302       j = start; j < end; j++
1303     ) {
1304       QueryStep *step = &self->steps.contents[j];
1305       for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) {
1306         uint16_t capture_id = step->capture_ids[k];
1307         if (capture_id == NONE) break;
1308         unsigned index, exists;
1309         array_search_sorted_by(&predicate_capture_ids, , capture_id, &index, &exists);
1310         if (exists) {
1311           step->is_definite = false;
1312           break;
1313         }
1314       }
1315     }
1316   }
1317 
1318   // Propagate indefiniteness backwards.
1319   bool done = self->steps.size == 0;
1320   while (!done) {
1321     done = true;
1322     for (unsigned i = self->steps.size - 1; i > 0; i--) {
1323       QueryStep *step = &self->steps.contents[i];
1324 
1325       // Determine if this step is definite or has definite alternatives.
1326       bool is_definite = false;
1327       for (;;) {
1328         if (step->is_definite) {
1329           is_definite = true;
1330           break;
1331         }
1332         if (step->alternative_index == NONE || step->alternative_index < i) {
1333           break;
1334         }
1335         step = &self->steps.contents[step->alternative_index];
1336       }
1337 
1338       // If not, mark its predecessor as indefinite.
1339       if (!is_definite) {
1340         QueryStep *prev_step = &self->steps.contents[i - 1];
1341         if (
1342           !prev_step->is_dead_end &&
1343           prev_step->depth != PATTERN_DONE_MARKER &&
1344           prev_step->is_definite
1345         ) {
1346           prev_step->is_definite = false;
1347           done = false;
1348         }
1349       }
1350     }
1351   }
1352 
1353   #ifdef DEBUG_ANALYZE_QUERY
1354     printf("Steps:\n");
1355     for (unsigned i = 0; i < self->steps.size; i++) {
1356       QueryStep *step = &self->steps.contents[i];
1357       if (step->depth == PATTERN_DONE_MARKER) {
1358         printf("  %u: DONE\n", i);
1359       } else {
1360         printf(
1361           "  %u: {symbol: %s, field: %s, is_definite: %d}\n",
1362           i,
1363           (step->symbol == WILDCARD_SYMBOL || step->symbol == NAMED_WILDCARD_SYMBOL)
1364             ? "ANY"
1365             : ts_language_symbol_name(self->language, step->symbol),
1366           (step->field ? ts_language_field_name_for_id(self->language, step->field) : "-"),
1367           step->is_definite
1368         );
1369       }
1370     }
1371   #endif
1372 
1373   // Cleanup
1374   for (unsigned i = 0; i < subgraphs.size; i++) {
1375     array_delete(&subgraphs.contents[i].start_states);
1376     array_delete(&subgraphs.contents[i].nodes);
1377   }
1378   array_delete(&subgraphs);
1379   array_delete(&next_nodes);
1380   array_delete(&states);
1381   array_delete(&next_states);
1382   array_delete(&deeper_states);
1383   array_delete(&final_step_indices);
1384   array_delete(&parent_step_indices);
1385   array_delete(&predicate_capture_ids);
1386   state_predecessor_map_delete(&predecessor_map);
1387 
1388   return all_patterns_are_valid;
1389 }
1390 
ts_query__finalize_steps(TSQuery * self)1391 static void ts_query__finalize_steps(TSQuery *self) {
1392   for (unsigned i = 0; i < self->steps.size; i++) {
1393     QueryStep *step = &self->steps.contents[i];
1394     uint32_t depth = step->depth;
1395     if (step->capture_ids[0] != NONE) {
1396       step->contains_captures = true;
1397     } else {
1398       step->contains_captures = false;
1399       for (unsigned j = i + 1; j < self->steps.size; j++) {
1400         QueryStep *s = &self->steps.contents[j];
1401         if (s->depth == PATTERN_DONE_MARKER || s->depth <= depth) break;
1402         if (s->capture_ids[0] != NONE) step->contains_captures = true;
1403       }
1404     }
1405   }
1406 }
1407 
ts_query__add_negated_fields(TSQuery * self,uint16_t step_index,TSFieldId * field_ids,uint16_t field_count)1408 static void ts_query__add_negated_fields(
1409   TSQuery *self,
1410   uint16_t step_index,
1411   TSFieldId *field_ids,
1412   uint16_t field_count
1413 ) {
1414   QueryStep *step = &self->steps.contents[step_index];
1415 
1416   // The negated field array stores a list of field lists, separated by zeros.
1417   // Try to find the start index of an existing list that matches this new list.
1418   bool failed_match = false;
1419   unsigned match_count = 0;
1420   unsigned start_i = 0;
1421   for (unsigned i = 0; i < self->negated_fields.size; i++) {
1422     TSFieldId existing_field_id = self->negated_fields.contents[i];
1423 
1424     // At each zero value, terminate the match attempt. If we've exactly
1425     // matched the new field list, then reuse this index. Otherwise,
1426     // start over the matching process.
1427     if (existing_field_id == 0) {
1428       if (match_count == field_count) {
1429         step->negated_field_list_id = start_i;
1430         return;
1431       } else {
1432         start_i = i + 1;
1433         match_count = 0;
1434         failed_match = false;
1435       }
1436     }
1437 
1438     // If the existing list matches our new list so far, then advance
1439     // to the next element of the new list.
1440     else if (
1441       match_count < field_count &&
1442       existing_field_id == field_ids[match_count] &&
1443       !failed_match
1444     ) {
1445       match_count++;
1446     }
1447 
1448     // Otherwise, this existing list has failed to match.
1449     else {
1450       match_count = 0;
1451       failed_match = true;
1452     }
1453   }
1454 
1455   step->negated_field_list_id = self->negated_fields.size;
1456   array_extend(&self->negated_fields, field_count, field_ids);
1457   array_push(&self->negated_fields, 0);
1458 }
1459 
ts_query__parse_string_literal(TSQuery * self,Stream * stream)1460 static TSQueryError ts_query__parse_string_literal(
1461   TSQuery *self,
1462   Stream *stream
1463 ) {
1464   const char *string_start = stream->input;
1465   if (stream->next != '"') return TSQueryErrorSyntax;
1466   stream_advance(stream);
1467   const char *prev_position = stream->input;
1468 
1469   bool is_escaped = false;
1470   array_clear(&self->string_buffer);
1471   for (;;) {
1472     if (is_escaped) {
1473       is_escaped = false;
1474       switch (stream->next) {
1475         case 'n':
1476           array_push(&self->string_buffer, '\n');
1477           break;
1478         case 'r':
1479           array_push(&self->string_buffer, '\r');
1480           break;
1481         case 't':
1482           array_push(&self->string_buffer, '\t');
1483           break;
1484         case '0':
1485           array_push(&self->string_buffer, '\0');
1486           break;
1487         default:
1488           array_extend(&self->string_buffer, stream->next_size, stream->input);
1489           break;
1490       }
1491       prev_position = stream->input + stream->next_size;
1492     } else {
1493       if (stream->next == '\\') {
1494         array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
1495         prev_position = stream->input + 1;
1496         is_escaped = true;
1497       } else if (stream->next == '"') {
1498         array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
1499         stream_advance(stream);
1500         return TSQueryErrorNone;
1501       } else if (stream->next == '\n') {
1502         stream_reset(stream, string_start);
1503         return TSQueryErrorSyntax;
1504       }
1505     }
1506     if (!stream_advance(stream)) {
1507       stream_reset(stream, string_start);
1508       return TSQueryErrorSyntax;
1509     }
1510   }
1511 }
1512 
1513 // Parse a single predicate associated with a pattern, adding it to the
1514 // query's internal `predicate_steps` array. Predicates are arbitrary
1515 // S-expressions associated with a pattern which are meant to be handled at
1516 // a higher level of abstraction, such as the Rust/JavaScript bindings. They
1517 // can contain '@'-prefixed capture names, double-quoted strings, and bare
1518 // symbols, which also represent strings.
ts_query__parse_predicate(TSQuery * self,Stream * stream)1519 static TSQueryError ts_query__parse_predicate(
1520   TSQuery *self,
1521   Stream *stream
1522 ) {
1523   if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1524   const char *predicate_name = stream->input;
1525   stream_scan_identifier(stream);
1526   uint32_t length = stream->input - predicate_name;
1527   uint16_t id = symbol_table_insert_name(
1528     &self->predicate_values,
1529     predicate_name,
1530     length
1531   );
1532   array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1533     .type = TSQueryPredicateStepTypeString,
1534     .value_id = id,
1535   }));
1536   stream_skip_whitespace(stream);
1537 
1538   for (;;) {
1539     if (stream->next == ')') {
1540       stream_advance(stream);
1541       stream_skip_whitespace(stream);
1542       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1543         .type = TSQueryPredicateStepTypeDone,
1544         .value_id = 0,
1545       }));
1546       break;
1547     }
1548 
1549     // Parse an '@'-prefixed capture name
1550     else if (stream->next == '@') {
1551       stream_advance(stream);
1552 
1553       // Parse the capture name
1554       if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1555       const char *capture_name = stream->input;
1556       stream_scan_identifier(stream);
1557       uint32_t length = stream->input - capture_name;
1558 
1559       // Add the capture id to the first step of the pattern
1560       int capture_id = symbol_table_id_for_name(
1561         &self->captures,
1562         capture_name,
1563         length
1564       );
1565       if (capture_id == -1) {
1566         stream_reset(stream, capture_name);
1567         return TSQueryErrorCapture;
1568       }
1569 
1570       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1571         .type = TSQueryPredicateStepTypeCapture,
1572         .value_id = capture_id,
1573       }));
1574     }
1575 
1576     // Parse a string literal
1577     else if (stream->next == '"') {
1578       TSQueryError e = ts_query__parse_string_literal(self, stream);
1579       if (e) return e;
1580       uint16_t id = symbol_table_insert_name(
1581         &self->predicate_values,
1582         self->string_buffer.contents,
1583         self->string_buffer.size
1584       );
1585       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1586         .type = TSQueryPredicateStepTypeString,
1587         .value_id = id,
1588       }));
1589     }
1590 
1591     // Parse a bare symbol
1592     else if (stream_is_ident_start(stream)) {
1593       const char *symbol_start = stream->input;
1594       stream_scan_identifier(stream);
1595       uint32_t length = stream->input - symbol_start;
1596       uint16_t id = symbol_table_insert_name(
1597         &self->predicate_values,
1598         symbol_start,
1599         length
1600       );
1601       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1602         .type = TSQueryPredicateStepTypeString,
1603         .value_id = id,
1604       }));
1605     }
1606 
1607     else {
1608       return TSQueryErrorSyntax;
1609     }
1610 
1611     stream_skip_whitespace(stream);
1612   }
1613 
1614   return 0;
1615 }
1616 
1617 // Read one S-expression pattern from the stream, and incorporate it into
1618 // the query's internal state machine representation. For nested patterns,
1619 // this function calls itself recursively.
ts_query__parse_pattern(TSQuery * self,Stream * stream,uint32_t depth,bool is_immediate)1620 static TSQueryError ts_query__parse_pattern(
1621   TSQuery *self,
1622   Stream *stream,
1623   uint32_t depth,
1624   bool is_immediate
1625 ) {
1626   if (stream->next == 0) return TSQueryErrorSyntax;
1627   if (stream->next == ')' || stream->next == ']') return PARENT_DONE;
1628 
1629   const uint32_t starting_step_index = self->steps.size;
1630 
1631   // Store the byte offset of each step in the query.
1632   if (
1633     self->step_offsets.size == 0 ||
1634     array_back(&self->step_offsets)->step_index != starting_step_index
1635   ) {
1636     array_push(&self->step_offsets, ((StepOffset) {
1637       .step_index = starting_step_index,
1638       .byte_offset = stream_offset(stream),
1639     }));
1640   }
1641 
1642   // An open bracket is the start of an alternation.
1643   if (stream->next == '[') {
1644     stream_advance(stream);
1645     stream_skip_whitespace(stream);
1646 
1647     // Parse each branch, and add a placeholder step in between the branches.
1648     Array(uint32_t) branch_step_indices = array_new();
1649     for (;;) {
1650       uint32_t start_index = self->steps.size;
1651       TSQueryError e = ts_query__parse_pattern(
1652         self,
1653         stream,
1654         depth,
1655         is_immediate
1656       );
1657 
1658       if (e == PARENT_DONE && stream->next == ']' && branch_step_indices.size > 0) {
1659         stream_advance(stream);
1660         break;
1661       } else if (e) {
1662         if (e == PARENT_DONE) e = TSQueryErrorSyntax;
1663         array_delete(&branch_step_indices);
1664         return e;
1665       }
1666 
1667       array_push(&branch_step_indices, start_index);
1668       array_push(&self->steps, query_step__new(0, depth, false));
1669     }
1670     (void)array_pop(&self->steps);
1671 
1672     // For all of the branches except for the last one, add the subsequent branch as an
1673     // alternative, and link the end of the branch to the current end of the steps.
1674     for (unsigned i = 0; i < branch_step_indices.size - 1; i++) {
1675       uint32_t step_index = branch_step_indices.contents[i];
1676       uint32_t next_step_index = branch_step_indices.contents[i + 1];
1677       QueryStep *start_step = &self->steps.contents[step_index];
1678       QueryStep *end_step = &self->steps.contents[next_step_index - 1];
1679       start_step->alternative_index = next_step_index;
1680       end_step->alternative_index = self->steps.size;
1681       end_step->is_dead_end = true;
1682     }
1683 
1684     array_delete(&branch_step_indices);
1685   }
1686 
1687   // An open parenthesis can be the start of three possible constructs:
1688   // * A grouped sequence
1689   // * A predicate
1690   // * A named node
1691   else if (stream->next == '(') {
1692     stream_advance(stream);
1693     stream_skip_whitespace(stream);
1694 
1695     // If this parenthesis is followed by a node, then it represents a grouped sequence.
1696     if (stream->next == '(' || stream->next == '"' || stream->next == '[') {
1697       bool child_is_immediate = false;
1698       for (;;) {
1699         if (stream->next == '.') {
1700           child_is_immediate = true;
1701           stream_advance(stream);
1702           stream_skip_whitespace(stream);
1703         }
1704         TSQueryError e = ts_query__parse_pattern(
1705           self,
1706           stream,
1707           depth,
1708           child_is_immediate
1709         );
1710         if (e == PARENT_DONE && stream->next == ')') {
1711           stream_advance(stream);
1712           break;
1713         } else if (e) {
1714           return e;
1715         }
1716 
1717         child_is_immediate = false;
1718       }
1719     }
1720 
1721     // A dot/pound character indicates the start of a predicate.
1722     else if (stream->next == '.' || stream->next == '#') {
1723       stream_advance(stream);
1724       return ts_query__parse_predicate(self, stream);
1725     }
1726 
1727     // Otherwise, this parenthesis is the start of a named node.
1728     else {
1729       TSSymbol symbol;
1730 
1731       // TODO - remove.
1732       // For temporary backward compatibility, handle '*' as a wildcard.
1733       if (stream->next == '*') {
1734         symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL;
1735         stream_advance(stream);
1736       }
1737 
1738       // Parse a normal node name
1739       else if (stream_is_ident_start(stream)) {
1740         const char *node_name = stream->input;
1741         stream_scan_identifier(stream);
1742         uint32_t length = stream->input - node_name;
1743 
1744         // TODO - remove.
1745         // For temporary backward compatibility, handle predicates without the leading '#' sign.
1746         if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) {
1747           stream_reset(stream, node_name);
1748           return ts_query__parse_predicate(self, stream);
1749         }
1750 
1751         // Parse the wildcard symbol
1752         else if (length == 1 && node_name[0] == '_') {
1753           symbol = depth > 0 ? NAMED_WILDCARD_SYMBOL : WILDCARD_SYMBOL;
1754         }
1755 
1756         else {
1757           symbol = ts_language_symbol_for_name(
1758             self->language,
1759             node_name,
1760             length,
1761             true
1762           );
1763           if (!symbol) {
1764             stream_reset(stream, node_name);
1765             return TSQueryErrorNodeType;
1766           }
1767         }
1768       } else {
1769         return TSQueryErrorSyntax;
1770       }
1771 
1772       // Add a step for the node.
1773       array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1774       if (ts_language_symbol_metadata(self->language, symbol).supertype) {
1775         QueryStep *step = array_back(&self->steps);
1776         step->supertype_symbol = step->symbol;
1777         step->symbol = NAMED_WILDCARD_SYMBOL;
1778       }
1779 
1780       stream_skip_whitespace(stream);
1781 
1782       if (stream->next == '/') {
1783         stream_advance(stream);
1784         if (!stream_is_ident_start(stream)) {
1785           return TSQueryErrorSyntax;
1786         }
1787 
1788         const char *node_name = stream->input;
1789         stream_scan_identifier(stream);
1790         uint32_t length = stream->input - node_name;
1791 
1792         QueryStep *step = array_back(&self->steps);
1793         step->symbol = ts_language_symbol_for_name(
1794           self->language,
1795           node_name,
1796           length,
1797           true
1798         );
1799         if (!step->symbol) {
1800           stream_reset(stream, node_name);
1801           return TSQueryErrorNodeType;
1802         }
1803 
1804         stream_skip_whitespace(stream);
1805       }
1806 
1807       // Parse the child patterns
1808       bool child_is_immediate = false;
1809       uint16_t last_child_step_index = 0;
1810       uint16_t negated_field_count = 0;
1811       TSFieldId negated_field_ids[MAX_NEGATED_FIELD_COUNT];
1812       for (;;) {
1813         // Parse a negated field assertion
1814         if (stream->next == '!') {
1815           stream_advance(stream);
1816           stream_skip_whitespace(stream);
1817           if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1818           const char *field_name = stream->input;
1819           stream_scan_identifier(stream);
1820           uint32_t length = stream->input - field_name;
1821           stream_skip_whitespace(stream);
1822 
1823           TSFieldId field_id = ts_language_field_id_for_name(
1824             self->language,
1825             field_name,
1826             length
1827           );
1828           if (!field_id) {
1829             stream->input = field_name;
1830             return TSQueryErrorField;
1831           }
1832 
1833           // Keep the field ids sorted.
1834           if (negated_field_count < MAX_NEGATED_FIELD_COUNT) {
1835             negated_field_ids[negated_field_count] = field_id;
1836             negated_field_count++;
1837           }
1838 
1839           continue;
1840         }
1841 
1842         // Parse a sibling anchor
1843         if (stream->next == '.') {
1844           child_is_immediate = true;
1845           stream_advance(stream);
1846           stream_skip_whitespace(stream);
1847         }
1848 
1849         uint16_t step_index = self->steps.size;
1850         TSQueryError e = ts_query__parse_pattern(
1851           self,
1852           stream,
1853           depth + 1,
1854           child_is_immediate
1855         );
1856         if (e == PARENT_DONE && stream->next == ')') {
1857           if (child_is_immediate) {
1858             if (last_child_step_index == 0) {
1859               return TSQueryErrorSyntax;
1860             }
1861             self->steps.contents[last_child_step_index].is_last_child = true;
1862           }
1863 
1864           if (negated_field_count) {
1865             ts_query__add_negated_fields(
1866               self,
1867               starting_step_index,
1868               negated_field_ids,
1869               negated_field_count
1870             );
1871           }
1872 
1873           stream_advance(stream);
1874           break;
1875         } else if (e) {
1876           return e;
1877         }
1878 
1879         last_child_step_index = step_index;
1880         child_is_immediate = false;
1881       }
1882     }
1883   }
1884 
1885   // Parse a wildcard pattern
1886   else if (
1887     stream->next == '_' ||
1888 
1889     // TODO remove.
1890     // For temporary backward compatibility, handle '*' as a wildcard.
1891     stream->next == '*'
1892   ) {
1893     stream_advance(stream);
1894     stream_skip_whitespace(stream);
1895 
1896     // Add a step that matches any kind of node
1897     array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate));
1898   }
1899 
1900   // Parse a double-quoted anonymous leaf node expression
1901   else if (stream->next == '"') {
1902     const char *string_start = stream->input;
1903     TSQueryError e = ts_query__parse_string_literal(self, stream);
1904     if (e) return e;
1905 
1906     // Add a step for the node
1907     TSSymbol symbol = ts_language_symbol_for_name(
1908       self->language,
1909       self->string_buffer.contents,
1910       self->string_buffer.size,
1911       false
1912     );
1913     if (!symbol) {
1914       stream_reset(stream, string_start + 1);
1915       return TSQueryErrorNodeType;
1916     }
1917     array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1918   }
1919 
1920   // Parse a field-prefixed pattern
1921   else if (stream_is_ident_start(stream)) {
1922     // Parse the field name
1923     const char *field_name = stream->input;
1924     stream_scan_identifier(stream);
1925     uint32_t length = stream->input - field_name;
1926     stream_skip_whitespace(stream);
1927 
1928     if (stream->next != ':') {
1929       stream_reset(stream, field_name);
1930       return TSQueryErrorSyntax;
1931     }
1932     stream_advance(stream);
1933     stream_skip_whitespace(stream);
1934 
1935     // Parse the pattern
1936     TSQueryError e = ts_query__parse_pattern(
1937       self,
1938       stream,
1939       depth,
1940       is_immediate
1941     );
1942     if (e == PARENT_DONE) return TSQueryErrorSyntax;
1943     if (e) return e;
1944 
1945     // Add the field name to the first step of the pattern
1946     TSFieldId field_id = ts_language_field_id_for_name(
1947       self->language,
1948       field_name,
1949       length
1950     );
1951     if (!field_id) {
1952       stream->input = field_name;
1953       return TSQueryErrorField;
1954     }
1955 
1956     uint32_t step_index = starting_step_index;
1957     QueryStep *step = &self->steps.contents[step_index];
1958     for (;;) {
1959       step->field = field_id;
1960       if (
1961         step->alternative_index != NONE &&
1962         step->alternative_index > step_index &&
1963         step->alternative_index < self->steps.size
1964       ) {
1965         step_index = step->alternative_index;
1966         step = &self->steps.contents[step_index];
1967       } else {
1968         break;
1969       }
1970     }
1971   }
1972 
1973   else {
1974     return TSQueryErrorSyntax;
1975   }
1976 
1977   stream_skip_whitespace(stream);
1978 
1979   // Parse suffixes modifiers for this pattern
1980   for (;;) {
1981     // Parse the one-or-more operator.
1982     if (stream->next == '+') {
1983       stream_advance(stream);
1984       stream_skip_whitespace(stream);
1985 
1986       QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false);
1987       repeat_step.alternative_index = starting_step_index;
1988       repeat_step.is_pass_through = true;
1989       repeat_step.alternative_is_immediate = true;
1990       array_push(&self->steps, repeat_step);
1991     }
1992 
1993     // Parse the zero-or-more repetition operator.
1994     else if (stream->next == '*') {
1995       stream_advance(stream);
1996       stream_skip_whitespace(stream);
1997 
1998       QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false);
1999       repeat_step.alternative_index = starting_step_index;
2000       repeat_step.is_pass_through = true;
2001       repeat_step.alternative_is_immediate = true;
2002       array_push(&self->steps, repeat_step);
2003 
2004       QueryStep *step = &self->steps.contents[starting_step_index];
2005       while (step->alternative_index != NONE) {
2006         step = &self->steps.contents[step->alternative_index];
2007       }
2008       step->alternative_index = self->steps.size;
2009     }
2010 
2011     // Parse the optional operator.
2012     else if (stream->next == '?') {
2013       stream_advance(stream);
2014       stream_skip_whitespace(stream);
2015 
2016       QueryStep *step = &self->steps.contents[starting_step_index];
2017       while (step->alternative_index != NONE) {
2018         step = &self->steps.contents[step->alternative_index];
2019       }
2020       step->alternative_index = self->steps.size;
2021     }
2022 
2023     // Parse an '@'-prefixed capture pattern
2024     else if (stream->next == '@') {
2025       stream_advance(stream);
2026       if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
2027       const char *capture_name = stream->input;
2028       stream_scan_identifier(stream);
2029       uint32_t length = stream->input - capture_name;
2030       stream_skip_whitespace(stream);
2031 
2032       // Add the capture id to the first step of the pattern
2033       uint16_t capture_id = symbol_table_insert_name(
2034         &self->captures,
2035         capture_name,
2036         length
2037       );
2038 
2039       uint32_t step_index = starting_step_index;
2040       for (;;) {
2041         QueryStep *step = &self->steps.contents[step_index];
2042         query_step__add_capture(step, capture_id);
2043         if (
2044           step->alternative_index != NONE &&
2045           step->alternative_index > step_index &&
2046           step->alternative_index < self->steps.size
2047         ) {
2048           step_index = step->alternative_index;
2049           step = &self->steps.contents[step_index];
2050         } else {
2051           break;
2052         }
2053       }
2054     }
2055 
2056     // No more suffix modifiers
2057     else {
2058       break;
2059     }
2060   }
2061 
2062   return 0;
2063 }
2064 
ts_query_new(const TSLanguage * language,const char * source,uint32_t source_len,uint32_t * error_offset,TSQueryError * error_type)2065 TSQuery *ts_query_new(
2066   const TSLanguage *language,
2067   const char *source,
2068   uint32_t source_len,
2069   uint32_t *error_offset,
2070   TSQueryError *error_type
2071 ) {
2072   if (
2073     !language ||
2074     language->version > TREE_SITTER_LANGUAGE_VERSION ||
2075     language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION
2076   ) {
2077     *error_type = TSQueryErrorLanguage;
2078     return NULL;
2079   }
2080 
2081   TSQuery *self = ts_malloc(sizeof(TSQuery));
2082   *self = (TSQuery) {
2083     .steps = array_new(),
2084     .pattern_map = array_new(),
2085     .captures = symbol_table_new(),
2086     .predicate_values = symbol_table_new(),
2087     .predicate_steps = array_new(),
2088     .patterns = array_new(),
2089     .step_offsets = array_new(),
2090     .string_buffer = array_new(),
2091     .negated_fields = array_new(),
2092     .wildcard_root_pattern_count = 0,
2093     .language = language,
2094   };
2095 
2096   array_push(&self->negated_fields, 0);
2097 
2098   // Parse all of the S-expressions in the given string.
2099   Stream stream = stream_new(source, source_len);
2100   stream_skip_whitespace(&stream);
2101   while (stream.input < stream.end) {
2102     uint32_t pattern_index = self->patterns.size;
2103     uint32_t start_step_index = self->steps.size;
2104     uint32_t start_predicate_step_index = self->predicate_steps.size;
2105     array_push(&self->patterns, ((QueryPattern) {
2106       .steps = (Slice) {.offset = start_step_index},
2107       .predicate_steps = (Slice) {.offset = start_predicate_step_index},
2108       .start_byte = stream_offset(&stream),
2109     }));
2110     *error_type = ts_query__parse_pattern(self, &stream, 0, false);
2111     array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false));
2112 
2113     QueryPattern *pattern = array_back(&self->patterns);
2114     pattern->steps.length = self->steps.size - start_step_index;
2115     pattern->predicate_steps.length = self->predicate_steps.size - start_predicate_step_index;
2116 
2117     // If any pattern could not be parsed, then report the error information
2118     // and terminate.
2119     if (*error_type) {
2120       if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax;
2121       *error_offset = stream_offset(&stream);
2122       ts_query_delete(self);
2123       return NULL;
2124     }
2125 
2126     // Maintain a map that can look up patterns for a given root symbol.
2127     uint16_t wildcard_root_alternative_index = NONE;
2128     for (;;) {
2129       QueryStep *step = &self->steps.contents[start_step_index];
2130 
2131       // If a pattern has a wildcard at its root, but it has a non-wildcard child,
2132       // then optimize the matching process by skipping matching the wildcard.
2133       // Later, during the matching process, the query cursor will check that
2134       // there is a parent node, and capture it if necessary.
2135       if (step->symbol == WILDCARD_SYMBOL && step->depth == 0) {
2136         QueryStep *second_step = &self->steps.contents[start_step_index + 1];
2137         if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1) {
2138           wildcard_root_alternative_index = step->alternative_index;
2139           start_step_index += 1;
2140           step = second_step;
2141         }
2142       }
2143 
2144       // Determine whether the pattern has a single root node. This affects
2145       // decisions about whether or not to start matching the pattern when
2146       // a query cursor has a range restriction.
2147       bool is_rooted = true;
2148       uint32_t start_depth = step->depth;
2149       for (uint32_t step_index = start_step_index + 1; step_index < self->steps.size; step_index++) {
2150         QueryStep *step = &self->steps.contents[step_index];
2151         if (step->depth == start_depth) {
2152           is_rooted = false;
2153           break;
2154         }
2155       }
2156 
2157       ts_query__pattern_map_insert(self, step->symbol, (PatternEntry) {
2158         .step_index = start_step_index,
2159         .pattern_index = pattern_index,
2160         .is_rooted = is_rooted
2161       });
2162       if (step->symbol == WILDCARD_SYMBOL) {
2163         self->wildcard_root_pattern_count++;
2164       }
2165 
2166       // If there are alternatives or options at the root of the pattern,
2167       // then add multiple entries to the pattern map.
2168       if (step->alternative_index != NONE) {
2169         start_step_index = step->alternative_index;
2170         step->alternative_index = NONE;
2171       } else if (wildcard_root_alternative_index != NONE) {
2172         start_step_index = wildcard_root_alternative_index;
2173         wildcard_root_alternative_index = NONE;
2174       } else {
2175         break;
2176       }
2177     }
2178   }
2179 
2180   if (!ts_query__analyze_patterns(self, error_offset)) {
2181     *error_type = TSQueryErrorStructure;
2182     ts_query_delete(self);
2183     return NULL;
2184   }
2185 
2186   ts_query__finalize_steps(self);
2187   array_delete(&self->string_buffer);
2188   return self;
2189 }
2190 
ts_query_delete(TSQuery * self)2191 void ts_query_delete(TSQuery *self) {
2192   if (self) {
2193     array_delete(&self->steps);
2194     array_delete(&self->pattern_map);
2195     array_delete(&self->predicate_steps);
2196     array_delete(&self->patterns);
2197     array_delete(&self->step_offsets);
2198     array_delete(&self->string_buffer);
2199     array_delete(&self->negated_fields);
2200     symbol_table_delete(&self->captures);
2201     symbol_table_delete(&self->predicate_values);
2202     ts_free(self);
2203   }
2204 }
2205 
ts_query_pattern_count(const TSQuery * self)2206 uint32_t ts_query_pattern_count(const TSQuery *self) {
2207   return self->patterns.size;
2208 }
2209 
ts_query_capture_count(const TSQuery * self)2210 uint32_t ts_query_capture_count(const TSQuery *self) {
2211   return self->captures.slices.size;
2212 }
2213 
ts_query_string_count(const TSQuery * self)2214 uint32_t ts_query_string_count(const TSQuery *self) {
2215   return self->predicate_values.slices.size;
2216 }
2217 
ts_query_capture_name_for_id(const TSQuery * self,uint32_t index,uint32_t * length)2218 const char *ts_query_capture_name_for_id(
2219   const TSQuery *self,
2220   uint32_t index,
2221   uint32_t *length
2222 ) {
2223   return symbol_table_name_for_id(&self->captures, index, length);
2224 }
2225 
ts_query_string_value_for_id(const TSQuery * self,uint32_t index,uint32_t * length)2226 const char *ts_query_string_value_for_id(
2227   const TSQuery *self,
2228   uint32_t index,
2229   uint32_t *length
2230 ) {
2231   return symbol_table_name_for_id(&self->predicate_values, index, length);
2232 }
2233 
ts_query_predicates_for_pattern(const TSQuery * self,uint32_t pattern_index,uint32_t * step_count)2234 const TSQueryPredicateStep *ts_query_predicates_for_pattern(
2235   const TSQuery *self,
2236   uint32_t pattern_index,
2237   uint32_t *step_count
2238 ) {
2239   Slice slice = self->patterns.contents[pattern_index].predicate_steps;
2240   *step_count = slice.length;
2241   if (self->predicate_steps.contents == NULL) {
2242     return NULL;
2243   }
2244   return &self->predicate_steps.contents[slice.offset];
2245 }
2246 
ts_query_start_byte_for_pattern(const TSQuery * self,uint32_t pattern_index)2247 uint32_t ts_query_start_byte_for_pattern(
2248   const TSQuery *self,
2249   uint32_t pattern_index
2250 ) {
2251   return self->patterns.contents[pattern_index].start_byte;
2252 }
2253 
ts_query_step_is_definite(const TSQuery * self,uint32_t byte_offset)2254 bool ts_query_step_is_definite(
2255   const TSQuery *self,
2256   uint32_t byte_offset
2257 ) {
2258   uint32_t step_index = UINT32_MAX;
2259   for (unsigned i = 0; i < self->step_offsets.size; i++) {
2260     StepOffset *step_offset = &self->step_offsets.contents[i];
2261     if (step_offset->byte_offset > byte_offset) break;
2262     step_index = step_offset->step_index;
2263   }
2264   if (step_index < self->steps.size) {
2265     return self->steps.contents[step_index].is_definite;
2266   } else {
2267     return false;
2268   }
2269 }
2270 
ts_query_disable_capture(TSQuery * self,const char * name,uint32_t length)2271 void ts_query_disable_capture(
2272   TSQuery *self,
2273   const char *name,
2274   uint32_t length
2275 ) {
2276   // Remove capture information for any pattern step that previously
2277   // captured with the given name.
2278   int id = symbol_table_id_for_name(&self->captures, name, length);
2279   if (id != -1) {
2280     for (unsigned i = 0; i < self->steps.size; i++) {
2281       QueryStep *step = &self->steps.contents[i];
2282       query_step__remove_capture(step, id);
2283     }
2284     ts_query__finalize_steps(self);
2285   }
2286 }
2287 
ts_query_disable_pattern(TSQuery * self,uint32_t pattern_index)2288 void ts_query_disable_pattern(
2289   TSQuery *self,
2290   uint32_t pattern_index
2291 ) {
2292   // Remove the given pattern from the pattern map. Its steps will still
2293   // be in the `steps` array, but they will never be read.
2294   for (unsigned i = 0; i < self->pattern_map.size; i++) {
2295     PatternEntry *pattern = &self->pattern_map.contents[i];
2296     if (pattern->pattern_index == pattern_index) {
2297       array_erase(&self->pattern_map, i);
2298       i--;
2299     }
2300   }
2301 }
2302 
2303 /***************
2304  * QueryCursor
2305  ***************/
2306 
ts_query_cursor_new(void)2307 TSQueryCursor *ts_query_cursor_new(void) {
2308   TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
2309   *self = (TSQueryCursor) {
2310     .did_exceed_match_limit = false,
2311     .ascending = false,
2312     .halted = false,
2313     .states = array_new(),
2314     .finished_states = array_new(),
2315     .capture_list_pool = capture_list_pool_new(),
2316     .start_byte = 0,
2317     .end_byte = UINT32_MAX,
2318     .start_point = {0, 0},
2319     .end_point = POINT_MAX,
2320   };
2321   array_reserve(&self->states, 8);
2322   array_reserve(&self->finished_states, 8);
2323   return self;
2324 }
2325 
ts_query_cursor_delete(TSQueryCursor * self)2326 void ts_query_cursor_delete(TSQueryCursor *self) {
2327   array_delete(&self->states);
2328   array_delete(&self->finished_states);
2329   ts_tree_cursor_delete(&self->cursor);
2330   capture_list_pool_delete(&self->capture_list_pool);
2331   ts_free(self);
2332 }
2333 
ts_query_cursor_did_exceed_match_limit(const TSQueryCursor * self)2334 bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
2335   return self->did_exceed_match_limit;
2336 }
2337 
ts_query_cursor_match_limit(const TSQueryCursor * self)2338 uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self) {
2339   return self->capture_list_pool.max_capture_list_count;
2340 }
2341 
ts_query_cursor_set_match_limit(TSQueryCursor * self,uint32_t limit)2342 void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) {
2343   self->capture_list_pool.max_capture_list_count = limit;
2344 }
2345 
ts_query_cursor_exec(TSQueryCursor * self,const TSQuery * query,TSNode node)2346 void ts_query_cursor_exec(
2347   TSQueryCursor *self,
2348   const TSQuery *query,
2349   TSNode node
2350 ) {
2351   array_clear(&self->states);
2352   array_clear(&self->finished_states);
2353   ts_tree_cursor_reset(&self->cursor, node);
2354   capture_list_pool_reset(&self->capture_list_pool);
2355   self->next_state_id = 0;
2356   self->depth = 0;
2357   self->ascending = false;
2358   self->halted = false;
2359   self->query = query;
2360   self->did_exceed_match_limit = false;
2361 }
2362 
ts_query_cursor_set_byte_range(TSQueryCursor * self,uint32_t start_byte,uint32_t end_byte)2363 void ts_query_cursor_set_byte_range(
2364   TSQueryCursor *self,
2365   uint32_t start_byte,
2366   uint32_t end_byte
2367 ) {
2368   if (end_byte == 0) {
2369     end_byte = UINT32_MAX;
2370   }
2371   self->start_byte = start_byte;
2372   self->end_byte = end_byte;
2373 }
2374 
ts_query_cursor_set_point_range(TSQueryCursor * self,TSPoint start_point,TSPoint end_point)2375 void ts_query_cursor_set_point_range(
2376   TSQueryCursor *self,
2377   TSPoint start_point,
2378   TSPoint end_point
2379 ) {
2380   if (end_point.row == 0 && end_point.column == 0) {
2381     end_point = POINT_MAX;
2382   }
2383   self->start_point = start_point;
2384   self->end_point = end_point;
2385 }
2386 
2387 // Search through all of the in-progress states, and find the captured
2388 // node that occurs earliest in the document.
ts_query_cursor__first_in_progress_capture(TSQueryCursor * self,uint32_t * state_index,uint32_t * byte_offset,uint32_t * pattern_index,bool * is_definite)2389 static bool ts_query_cursor__first_in_progress_capture(
2390   TSQueryCursor *self,
2391   uint32_t *state_index,
2392   uint32_t *byte_offset,
2393   uint32_t *pattern_index,
2394   bool *is_definite
2395 ) {
2396   bool result = false;
2397   *state_index = UINT32_MAX;
2398   *byte_offset = UINT32_MAX;
2399   *pattern_index = UINT32_MAX;
2400   for (unsigned i = 0; i < self->states.size; i++) {
2401     QueryState *state = &self->states.contents[i];
2402     if (state->dead) continue;
2403 
2404     const CaptureList *captures = capture_list_pool_get(
2405       &self->capture_list_pool,
2406       state->capture_list_id
2407     );
2408     if (state->consumed_capture_count >= captures->size) {
2409       continue;
2410     }
2411 
2412     TSNode node = captures->contents[state->consumed_capture_count].node;
2413     if (
2414       ts_node_end_byte(node) <= self->start_byte ||
2415       point_lte(ts_node_end_point(node), self->start_point)
2416     ) {
2417       state->consumed_capture_count++;
2418       i--;
2419       continue;
2420     }
2421 
2422     uint32_t node_start_byte = ts_node_start_byte(node);
2423     if (
2424       !result ||
2425       node_start_byte < *byte_offset ||
2426       (node_start_byte == *byte_offset && state->pattern_index < *pattern_index)
2427     ) {
2428       QueryStep *step = &self->query->steps.contents[state->step_index];
2429       if (is_definite) {
2430         *is_definite = step->is_definite;
2431       } else if (step->is_definite) {
2432         continue;
2433       }
2434 
2435       result = true;
2436       *state_index = i;
2437       *byte_offset = node_start_byte;
2438       *pattern_index = state->pattern_index;
2439     }
2440   }
2441   return result;
2442 }
2443 
2444 // Determine which node is first in a depth-first traversal
ts_query_cursor__compare_nodes(TSNode left,TSNode right)2445 int ts_query_cursor__compare_nodes(TSNode left, TSNode right) {
2446   if (left.id != right.id) {
2447     uint32_t left_start = ts_node_start_byte(left);
2448     uint32_t right_start = ts_node_start_byte(right);
2449     if (left_start < right_start) return -1;
2450     if (left_start > right_start) return 1;
2451     uint32_t left_node_count = ts_node_end_byte(left);
2452     uint32_t right_node_count = ts_node_end_byte(right);
2453     if (left_node_count > right_node_count) return -1;
2454     if (left_node_count < right_node_count) return 1;
2455   }
2456   return 0;
2457 }
2458 
2459 // Determine if either state contains a superset of the other state's captures.
ts_query_cursor__compare_captures(TSQueryCursor * self,QueryState * left_state,QueryState * right_state,bool * left_contains_right,bool * right_contains_left)2460 void ts_query_cursor__compare_captures(
2461   TSQueryCursor *self,
2462   QueryState *left_state,
2463   QueryState *right_state,
2464   bool *left_contains_right,
2465   bool *right_contains_left
2466 ) {
2467   const CaptureList *left_captures = capture_list_pool_get(
2468     &self->capture_list_pool,
2469     left_state->capture_list_id
2470   );
2471   const CaptureList *right_captures = capture_list_pool_get(
2472     &self->capture_list_pool,
2473     right_state->capture_list_id
2474   );
2475   *left_contains_right = true;
2476   *right_contains_left = true;
2477   unsigned i = 0, j = 0;
2478   for (;;) {
2479     if (i < left_captures->size) {
2480       if (j < right_captures->size) {
2481         TSQueryCapture *left = &left_captures->contents[i];
2482         TSQueryCapture *right = &right_captures->contents[j];
2483         if (left->node.id == right->node.id && left->index == right->index) {
2484           i++;
2485           j++;
2486         } else {
2487           switch (ts_query_cursor__compare_nodes(left->node, right->node)) {
2488             case -1:
2489               *right_contains_left = false;
2490               i++;
2491               break;
2492             case 1:
2493               *left_contains_right = false;
2494               j++;
2495               break;
2496             default:
2497               *right_contains_left = false;
2498               *left_contains_right = false;
2499               i++;
2500               j++;
2501               break;
2502           }
2503         }
2504       } else {
2505         *right_contains_left = false;
2506         break;
2507       }
2508     } else {
2509       if (j < right_captures->size) {
2510         *left_contains_right = false;
2511       }
2512       break;
2513     }
2514   }
2515 }
2516 
ts_query_cursor__add_state(TSQueryCursor * self,const PatternEntry * pattern)2517 static void ts_query_cursor__add_state(
2518   TSQueryCursor *self,
2519   const PatternEntry *pattern
2520 ) {
2521   QueryStep *step = &self->query->steps.contents[pattern->step_index];
2522   uint32_t start_depth = self->depth - step->depth;
2523 
2524   // Keep the states array in ascending order of start_depth and pattern_index,
2525   // so that it can be processed more efficiently elsewhere. Usually, there is
2526   // no work to do here because of two facts:
2527   // * States with lower start_depth are naturally added first due to the
2528   //   order in which nodes are visited.
2529   // * Earlier patterns are naturally added first because of the ordering of the
2530   //   pattern_map data structure that's used to initiate matches.
2531   //
2532   // This loop is only needed in cases where two conditions hold:
2533   // * A pattern consists of more than one sibling node, so that its states
2534   //   remain in progress after exiting the node that started the match.
2535   // * The first node in the pattern matches against multiple nodes at the
2536   //   same depth.
2537   //
2538   // An example of this is the pattern '((comment)* (function))'. If multiple
2539   // `comment` nodes appear in a row, then we may initiate a new state for this
2540   // pattern while another state for the same pattern is already in progress.
2541   // If there are multiple patterns like this in a query, then this loop will
2542   // need to execute in order to keep the states ordered by pattern_index.
2543   uint32_t index = self->states.size;
2544   while (index > 0) {
2545     QueryState *prev_state = &self->states.contents[index - 1];
2546     if (prev_state->start_depth < start_depth) break;
2547     if (prev_state->start_depth == start_depth) {
2548       if (prev_state->pattern_index < pattern->pattern_index) break;
2549       if (prev_state->pattern_index == pattern->pattern_index) {
2550         // Avoid inserting an unnecessary duplicate state, which would be
2551         // immediately pruned by the longest-match criteria.
2552         if (prev_state->step_index == pattern->step_index) return;
2553       }
2554     }
2555     index--;
2556   }
2557 
2558   LOG(
2559     "  start state. pattern:%u, step:%u\n",
2560     pattern->pattern_index,
2561     pattern->step_index
2562   );
2563   array_insert(&self->states, index, ((QueryState) {
2564     .capture_list_id = NONE,
2565     .step_index = pattern->step_index,
2566     .pattern_index = pattern->pattern_index,
2567     .start_depth = start_depth,
2568     .consumed_capture_count = 0,
2569     .seeking_immediate_match = true,
2570     .has_in_progress_alternatives = false,
2571     .needs_parent = step->depth == 1,
2572     .dead = false,
2573   }));
2574 }
2575 
2576 // Acquire a capture list for this state. If there are no capture lists left in the
2577 // pool, this will steal the capture list from another existing state, and mark that
2578 // other state as 'dead'.
ts_query_cursor__prepare_to_capture(TSQueryCursor * self,QueryState * state,unsigned state_index_to_preserve)2579 static CaptureList *ts_query_cursor__prepare_to_capture(
2580   TSQueryCursor *self,
2581   QueryState *state,
2582   unsigned state_index_to_preserve
2583 ) {
2584   if (state->capture_list_id == NONE) {
2585     state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool);
2586 
2587     // If there are no capture lists left in the pool, then terminate whichever
2588     // state has captured the earliest node in the document, and steal its
2589     // capture list.
2590     if (state->capture_list_id == NONE) {
2591       self->did_exceed_match_limit = true;
2592       uint32_t state_index, byte_offset, pattern_index;
2593       if (
2594         ts_query_cursor__first_in_progress_capture(
2595           self,
2596           &state_index,
2597           &byte_offset,
2598           &pattern_index,
2599           NULL
2600         ) &&
2601         state_index != state_index_to_preserve
2602       ) {
2603         LOG(
2604           "  abandon state. index:%u, pattern:%u, offset:%u.\n",
2605           state_index, pattern_index, byte_offset
2606         );
2607         QueryState *other_state = &self->states.contents[state_index];
2608         state->capture_list_id = other_state->capture_list_id;
2609         other_state->capture_list_id = NONE;
2610         other_state->dead = true;
2611         CaptureList *list = capture_list_pool_get_mut(
2612           &self->capture_list_pool,
2613           state->capture_list_id
2614         );
2615         array_clear(list);
2616         return list;
2617       } else {
2618         LOG("  ran out of capture lists");
2619         return NULL;
2620       }
2621     }
2622   }
2623   return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id);
2624 }
2625 
ts_query_cursor__capture(TSQueryCursor * self,QueryState * state,QueryStep * step,TSNode node)2626 static void ts_query_cursor__capture(
2627   TSQueryCursor *self,
2628   QueryState *state,
2629   QueryStep *step,
2630   TSNode node
2631 ) {
2632   if (state->dead) return;
2633   CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX);
2634   if (!capture_list) {
2635     state->dead = true;
2636     return;
2637   }
2638 
2639   for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) {
2640     uint16_t capture_id = step->capture_ids[j];
2641     if (step->capture_ids[j] == NONE) break;
2642     array_push(capture_list, ((TSQueryCapture) { node, capture_id }));
2643     LOG(
2644       "  capture node. type:%s, pattern:%u, capture_id:%u, capture_count:%u\n",
2645       ts_node_type(node),
2646       state->pattern_index,
2647       capture_id,
2648       capture_list->size
2649     );
2650   }
2651 }
2652 
2653 // Duplicate the given state and insert the newly-created state immediately after
2654 // the given state in the `states` array. Ensures that the given state reference is
2655 // still valid, even if the states array is reallocated.
ts_query_cursor__copy_state(TSQueryCursor * self,QueryState ** state_ref)2656 static QueryState *ts_query_cursor__copy_state(
2657   TSQueryCursor *self,
2658   QueryState **state_ref
2659 ) {
2660   const QueryState *state = *state_ref;
2661   uint32_t state_index = state - self->states.contents;
2662   QueryState copy = *state;
2663   copy.capture_list_id = NONE;
2664 
2665   // If the state has captures, copy its capture list.
2666   if (state->capture_list_id != NONE) {
2667     CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, &copy, state_index);
2668     if (!new_captures) return NULL;
2669     const CaptureList *old_captures = capture_list_pool_get(
2670       &self->capture_list_pool,
2671       state->capture_list_id
2672     );
2673     array_push_all(new_captures, old_captures);
2674   }
2675 
2676   array_insert(&self->states, state_index + 1, copy);
2677   *state_ref = &self->states.contents[state_index];
2678   return &self->states.contents[state_index + 1];
2679 }
2680 
2681 // Walk the tree, processing patterns until at least one pattern finishes,
2682 // If one or more patterns finish, return `true` and store their states in the
2683 // `finished_states` array. Multiple patterns can finish on the same node. If
2684 // there are no more matches, return `false`.
ts_query_cursor__advance(TSQueryCursor * self,bool stop_on_definite_step)2685 static inline bool ts_query_cursor__advance(
2686   TSQueryCursor *self,
2687   bool stop_on_definite_step
2688 ) {
2689   bool did_match = false;
2690   for (;;) {
2691     if (self->halted) {
2692       while (self->states.size > 0) {
2693         QueryState state = array_pop(&self->states);
2694         capture_list_pool_release(
2695           &self->capture_list_pool,
2696           state.capture_list_id
2697         );
2698       }
2699     }
2700 
2701     if (did_match || self->halted) return did_match;
2702 
2703     // Exit the current node.
2704     if (self->ascending) {
2705       LOG("leave node. type:%s\n", ts_node_type(ts_tree_cursor_current_node(&self->cursor)));
2706 
2707       // Leave this node by stepping to its next sibling or to its parent.
2708       if (ts_tree_cursor_goto_next_sibling(&self->cursor)) {
2709         self->ascending = false;
2710       } else if (ts_tree_cursor_goto_parent(&self->cursor)) {
2711         self->depth--;
2712       } else {
2713         LOG("halt at root\n");
2714         self->halted = true;
2715       }
2716 
2717       // After leaving a node, remove any states that cannot make further progress.
2718       uint32_t deleted_count = 0;
2719       for (unsigned i = 0, n = self->states.size; i < n; i++) {
2720         QueryState *state = &self->states.contents[i];
2721         QueryStep *step = &self->query->steps.contents[state->step_index];
2722 
2723         // If a state completed its pattern inside of this node, but was deferred from finishing
2724         // in order to search for longer matches, mark it as finished.
2725         if (step->depth == PATTERN_DONE_MARKER) {
2726           if (state->start_depth > self->depth || self->halted) {
2727             LOG("  finish pattern %u\n", state->pattern_index);
2728             state->id = self->next_state_id++;
2729             array_push(&self->finished_states, *state);
2730             did_match = true;
2731             deleted_count++;
2732             continue;
2733           }
2734         }
2735 
2736         // If a state needed to match something within this node, then remove that state
2737         // as it has failed to match.
2738         else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) {
2739           LOG(
2740             "  failed to match. pattern:%u, step:%u\n",
2741             state->pattern_index,
2742             state->step_index
2743           );
2744           capture_list_pool_release(
2745             &self->capture_list_pool,
2746             state->capture_list_id
2747           );
2748           deleted_count++;
2749           continue;
2750         }
2751 
2752         if (deleted_count > 0) {
2753           self->states.contents[i - deleted_count] = *state;
2754         }
2755       }
2756       self->states.size -= deleted_count;
2757     }
2758 
2759     // Enter a new node.
2760     else {
2761       // Get the properties of the current node.
2762       TSNode node = ts_tree_cursor_current_node(&self->cursor);
2763       TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor);
2764       TSSymbol symbol = ts_node_symbol(node);
2765       bool is_named = ts_node_is_named(node);
2766       bool has_later_siblings;
2767       bool has_later_named_siblings;
2768       bool can_have_later_siblings_with_this_field;
2769       TSFieldId field_id = 0;
2770       TSSymbol supertypes[8] = {0};
2771       unsigned supertype_count = 8;
2772       ts_tree_cursor_current_status(
2773         &self->cursor,
2774         &field_id,
2775         &has_later_siblings,
2776         &has_later_named_siblings,
2777         &can_have_later_siblings_with_this_field,
2778         supertypes,
2779         &supertype_count
2780       );
2781       LOG(
2782         "enter node. type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n",
2783         ts_node_type(node),
2784         ts_language_field_name_for_id(self->query->language, field_id),
2785         ts_node_start_point(node).row,
2786         self->states.size,
2787         self->finished_states.size
2788       );
2789 
2790       bool node_intersects_range = (
2791         ts_node_end_byte(node) > self->start_byte &&
2792         ts_node_start_byte(node) < self->end_byte &&
2793         point_gt(ts_node_end_point(node), self->start_point) &&
2794         point_lt(ts_node_start_point(node), self->end_point)
2795       );
2796 
2797       bool parent_intersects_range = ts_node_is_null(parent_node) || (
2798         ts_node_end_byte(parent_node) > self->start_byte &&
2799         ts_node_start_byte(parent_node) < self->end_byte &&
2800         point_gt(ts_node_end_point(parent_node), self->start_point) &&
2801         point_lt(ts_node_start_point(parent_node), self->end_point)
2802       );
2803 
2804         // Add new states for any patterns whose root node is a wildcard.
2805       for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) {
2806         PatternEntry *pattern = &self->query->pattern_map.contents[i];
2807 
2808         // If this node matches the first step of the pattern, then add a new
2809         // state at the start of this pattern.
2810         QueryStep *step = &self->query->steps.contents[pattern->step_index];
2811         if (
2812           (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) &&
2813           (!step->field || field_id == step->field) &&
2814           (!step->supertype_symbol || supertype_count > 0)
2815         ) {
2816           ts_query_cursor__add_state(self, pattern);
2817         }
2818       }
2819 
2820       // Add new states for any patterns whose root node matches this node.
2821       unsigned i;
2822       if (ts_query__pattern_map_search(self->query, symbol, &i)) {
2823         PatternEntry *pattern = &self->query->pattern_map.contents[i];
2824 
2825         QueryStep *step = &self->query->steps.contents[pattern->step_index];
2826         do {
2827           // If this node matches the first step of the pattern, then add a new
2828           // state at the start of this pattern.
2829           if (
2830             (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) &&
2831             (!step->field || field_id == step->field)
2832           ) {
2833             ts_query_cursor__add_state(self, pattern);
2834           }
2835 
2836           // Advance to the next pattern whose root node matches this node.
2837           i++;
2838           if (i == self->query->pattern_map.size) break;
2839           pattern = &self->query->pattern_map.contents[i];
2840           step = &self->query->steps.contents[pattern->step_index];
2841         } while (step->symbol == symbol);
2842       }
2843 
2844       // Update all of the in-progress states with current node.
2845       for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) {
2846         QueryState *state = &self->states.contents[i];
2847         QueryStep *step = &self->query->steps.contents[state->step_index];
2848         state->has_in_progress_alternatives = false;
2849         copy_count = 0;
2850 
2851         // Check that the node matches all of the criteria for the next
2852         // step of the pattern.
2853         if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue;
2854 
2855         // Determine if this node matches this step of the pattern, and also
2856         // if this node can have later siblings that match this step of the
2857         // pattern.
2858         bool node_does_match =
2859           step->symbol == symbol ||
2860           step->symbol == WILDCARD_SYMBOL ||
2861           (step->symbol == NAMED_WILDCARD_SYMBOL && is_named);
2862         bool later_sibling_can_match = has_later_siblings;
2863         if ((step->is_immediate && is_named) || state->seeking_immediate_match) {
2864           later_sibling_can_match = false;
2865         }
2866         if (step->is_last_child && has_later_named_siblings) {
2867           node_does_match = false;
2868         }
2869         if (step->supertype_symbol) {
2870           bool has_supertype = false;
2871           for (unsigned j = 0; j < supertype_count; j++) {
2872             if (supertypes[j] == step->supertype_symbol) {
2873               has_supertype = true;
2874               break;
2875             }
2876           }
2877           if (!has_supertype) node_does_match = false;
2878         }
2879         if (step->field) {
2880           if (step->field == field_id) {
2881             if (!can_have_later_siblings_with_this_field) {
2882               later_sibling_can_match = false;
2883             }
2884           } else {
2885             node_does_match = false;
2886           }
2887         }
2888 
2889         if (step->negated_field_list_id) {
2890           TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id];
2891           for (;;) {
2892             TSFieldId negated_field_id = *negated_field_ids;
2893             if (negated_field_id) {
2894               negated_field_ids++;
2895               if (ts_node_child_by_field_id(node, negated_field_id).id) {
2896                 node_does_match = false;
2897                 break;
2898               }
2899             } else {
2900               break;
2901             }
2902           }
2903         }
2904 
2905         // Remove states immediately if it is ever clear that they cannot match.
2906         if (!node_does_match) {
2907           if (!later_sibling_can_match) {
2908             LOG(
2909               "  discard state. pattern:%u, step:%u\n",
2910               state->pattern_index,
2911               state->step_index
2912             );
2913             capture_list_pool_release(
2914               &self->capture_list_pool,
2915               state->capture_list_id
2916             );
2917             array_erase(&self->states, i);
2918             i--;
2919           }
2920           continue;
2921         }
2922 
2923         // Some patterns can match their root node in multiple ways, capturing different
2924         // children. If this pattern step could match later children within the same
2925         // parent, then this query state cannot simply be updated in place. It must be
2926         // split into two states: one that matches this node, and one which skips over
2927         // this node, to preserve the possibility of matching later siblings.
2928         if (later_sibling_can_match && (step->contains_captures || !step->is_definite)) {
2929           if (ts_query_cursor__copy_state(self, &state)) {
2930             LOG(
2931               "  split state for capture. pattern:%u, step:%u\n",
2932               state->pattern_index,
2933               state->step_index
2934             );
2935             copy_count++;
2936           }
2937         }
2938 
2939         // If this pattern started with a wildcard, such that the pattern map
2940         // actually points to the *second* step of the pattern, then check
2941         // that the node has a parent, and capture the parent node if necessary.
2942         if (state->needs_parent) {
2943           TSNode parent = ts_tree_cursor_parent_node(&self->cursor);
2944           if (ts_node_is_null(parent)) {
2945             LOG("  missing parent node\n");
2946             state->dead = true;
2947           } else {
2948             state->needs_parent = false;
2949             QueryStep *skipped_wildcard_step = step;
2950             do {
2951               skipped_wildcard_step--;
2952             } while (
2953               skipped_wildcard_step->is_dead_end ||
2954               skipped_wildcard_step->is_pass_through ||
2955               skipped_wildcard_step->depth > 0
2956             );
2957             if (skipped_wildcard_step->capture_ids[0] != NONE) {
2958               LOG("  capture wildcard parent\n");
2959               ts_query_cursor__capture(
2960                 self,
2961                 state,
2962                 skipped_wildcard_step,
2963                 parent
2964               );
2965             }
2966           }
2967         }
2968 
2969         // If the current node is captured in this pattern, add it to the capture list.
2970         if (step->capture_ids[0] != NONE) {
2971           ts_query_cursor__capture(self, state, step, node);
2972         }
2973 
2974         if (state->dead) {
2975           array_erase(&self->states, i);
2976           i--;
2977           continue;
2978         }
2979 
2980         // Advance this state to the next step of its pattern.
2981         state->step_index++;
2982         state->seeking_immediate_match = false;
2983         LOG(
2984           "  advance state. pattern:%u, step:%u\n",
2985           state->pattern_index,
2986           state->step_index
2987         );
2988 
2989         QueryStep *next_step = &self->query->steps.contents[state->step_index];
2990         if (stop_on_definite_step && next_step->is_definite) did_match = true;
2991 
2992         // If this state's next step has an alternative step, then copy the state in order
2993         // to pursue both alternatives. The alternative step itself may have an alternative,
2994         // so this is an interative process.
2995         unsigned end_index = i + 1;
2996         for (unsigned j = i; j < end_index; j++) {
2997           QueryState *state = &self->states.contents[j];
2998           QueryStep *next_step = &self->query->steps.contents[state->step_index];
2999           if (next_step->alternative_index != NONE) {
3000             // A "dead-end" step exists only to add a non-sequential jump into the step sequence,
3001             // via its alternative index. When a state reaches a dead-end step, it jumps straight
3002             // to the step's alternative.
3003             if (next_step->is_dead_end) {
3004               state->step_index = next_step->alternative_index;
3005               j--;
3006               continue;
3007             }
3008 
3009             // A "pass-through" step exists only to add a branch into the step sequence,
3010             // via its alternative_index. When a state reaches a pass-through step, it splits
3011             // in order to process the alternative step, and then it advances to the next step.
3012             if (next_step->is_pass_through) {
3013               state->step_index++;
3014               j--;
3015             }
3016 
3017             QueryState *copy = ts_query_cursor__copy_state(self, &state);
3018             if (copy) {
3019               LOG(
3020                 "  split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n",
3021                 copy->pattern_index,
3022                 copy->step_index,
3023                 next_step->alternative_index,
3024                 next_step->alternative_is_immediate,
3025                 capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size
3026               );
3027               end_index++;
3028               copy_count++;
3029               copy->step_index = next_step->alternative_index;
3030               if (next_step->alternative_is_immediate) {
3031                 copy->seeking_immediate_match = true;
3032               }
3033             }
3034           }
3035         }
3036       }
3037 
3038       for (unsigned i = 0; i < self->states.size; i++) {
3039         QueryState *state = &self->states.contents[i];
3040         if (state->dead) {
3041           array_erase(&self->states, i);
3042           i--;
3043           continue;
3044         }
3045 
3046         // Enfore the longest-match criteria. When a query pattern contains optional or
3047         // repeated nodes, this is necessary to avoid multiple redundant states, where
3048         // one state has a strict subset of another state's captures.
3049         bool did_remove = false;
3050         for (unsigned j = i + 1; j < self->states.size; j++) {
3051           QueryState *other_state = &self->states.contents[j];
3052 
3053           // Query states are kept in ascending order of start_depth and pattern_index.
3054           // Since the longest-match criteria is only used for deduping matches of the same
3055           // pattern and root node, we only need to perform pairwise comparisons within a
3056           // small slice of the states array.
3057           if (
3058             other_state->start_depth != state->start_depth ||
3059             other_state->pattern_index != state->pattern_index
3060           ) break;
3061 
3062           bool left_contains_right, right_contains_left;
3063           ts_query_cursor__compare_captures(
3064             self,
3065             state,
3066             other_state,
3067             &left_contains_right,
3068             &right_contains_left
3069           );
3070           if (left_contains_right) {
3071             if (state->step_index == other_state->step_index) {
3072               LOG(
3073                 "  drop shorter state. pattern: %u, step_index: %u\n",
3074                 state->pattern_index,
3075                 state->step_index
3076               );
3077               capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id);
3078               array_erase(&self->states, j);
3079               j--;
3080               continue;
3081             }
3082             other_state->has_in_progress_alternatives = true;
3083           }
3084           if (right_contains_left) {
3085             if (state->step_index == other_state->step_index) {
3086               LOG(
3087                 "  drop shorter state. pattern: %u, step_index: %u\n",
3088                 state->pattern_index,
3089                 state->step_index
3090               );
3091               capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
3092               array_erase(&self->states, i);
3093               i--;
3094               did_remove = true;
3095               break;
3096             }
3097             state->has_in_progress_alternatives = true;
3098           }
3099         }
3100 
3101         // If there the state is at the end of its pattern, remove it from the list
3102         // of in-progress states and add it to the list of finished states.
3103         if (!did_remove) {
3104           LOG(
3105             "  keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n",
3106             state->pattern_index,
3107             state->start_depth,
3108             state->step_index,
3109             capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size
3110           );
3111           QueryStep *next_step = &self->query->steps.contents[state->step_index];
3112           if (next_step->depth == PATTERN_DONE_MARKER) {
3113             if (state->has_in_progress_alternatives) {
3114               LOG("  defer finishing pattern %u\n", state->pattern_index);
3115             } else {
3116               LOG("  finish pattern %u\n", state->pattern_index);
3117               state->id = self->next_state_id++;
3118               array_push(&self->finished_states, *state);
3119               array_erase(&self->states, state - self->states.contents);
3120               did_match = true;
3121               i--;
3122             }
3123           }
3124         }
3125       }
3126 
3127       // When the current node ends prior to the desired start offset,
3128       // only descend for the purpose of continuing in-progress matches.
3129       bool should_descend = node_intersects_range;
3130       if (!should_descend) {
3131         for (unsigned i = 0; i < self->states.size; i++) {
3132           QueryState *state = &self->states.contents[i];;
3133           QueryStep *next_step = &self->query->steps.contents[state->step_index];
3134           if (
3135             next_step->depth != PATTERN_DONE_MARKER &&
3136             state->start_depth + next_step->depth > self->depth
3137           ) {
3138             should_descend = true;
3139             break;
3140           }
3141         }
3142       }
3143 
3144       if (!should_descend) {
3145         LOG(
3146           "  not descending. node end byte: %u, start byte: %u\n",
3147           ts_node_end_byte(node),
3148           self->start_byte
3149         );
3150       }
3151 
3152       if (should_descend && ts_tree_cursor_goto_first_child(&self->cursor)) {
3153         self->depth++;
3154       } else {
3155         self->ascending = true;
3156       }
3157     }
3158   }
3159 }
3160 
ts_query_cursor_next_match(TSQueryCursor * self,TSQueryMatch * match)3161 bool ts_query_cursor_next_match(
3162   TSQueryCursor *self,
3163   TSQueryMatch *match
3164 ) {
3165   if (self->finished_states.size == 0) {
3166     if (!ts_query_cursor__advance(self, false)) {
3167       return false;
3168     }
3169   }
3170 
3171   QueryState *state = &self->finished_states.contents[0];
3172   match->id = state->id;
3173   match->pattern_index = state->pattern_index;
3174   const CaptureList *captures = capture_list_pool_get(
3175     &self->capture_list_pool,
3176     state->capture_list_id
3177   );
3178   match->captures = captures->contents;
3179   match->capture_count = captures->size;
3180   capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
3181   array_erase(&self->finished_states, 0);
3182   return true;
3183 }
3184 
ts_query_cursor_remove_match(TSQueryCursor * self,uint32_t match_id)3185 void ts_query_cursor_remove_match(
3186   TSQueryCursor *self,
3187   uint32_t match_id
3188 ) {
3189   for (unsigned i = 0; i < self->finished_states.size; i++) {
3190     const QueryState *state = &self->finished_states.contents[i];
3191     if (state->id == match_id) {
3192       capture_list_pool_release(
3193         &self->capture_list_pool,
3194         state->capture_list_id
3195       );
3196       array_erase(&self->finished_states, i);
3197       return;
3198     }
3199   }
3200 }
3201 
ts_query_cursor_next_capture(TSQueryCursor * self,TSQueryMatch * match,uint32_t * capture_index)3202 bool ts_query_cursor_next_capture(
3203   TSQueryCursor *self,
3204   TSQueryMatch *match,
3205   uint32_t *capture_index
3206 ) {
3207   // The goal here is to return captures in order, even though they may not
3208   // be discovered in order, because patterns can overlap. Search for matches
3209   // until there is a finished capture that is before any unfinished capture.
3210   for (;;) {
3211     // First, find the earliest capture in an unfinished match.
3212     uint32_t first_unfinished_capture_byte;
3213     uint32_t first_unfinished_pattern_index;
3214     uint32_t first_unfinished_state_index;
3215     bool first_unfinished_state_is_definite = false;
3216     ts_query_cursor__first_in_progress_capture(
3217       self,
3218       &first_unfinished_state_index,
3219       &first_unfinished_capture_byte,
3220       &first_unfinished_pattern_index,
3221       &first_unfinished_state_is_definite
3222     );
3223 
3224     // Then find the earliest capture in a finished match. It must occur
3225     // before the first capture in an *unfinished* match.
3226     QueryState *first_finished_state = NULL;
3227     uint32_t first_finished_capture_byte = first_unfinished_capture_byte;
3228     uint32_t first_finished_pattern_index = first_unfinished_pattern_index;
3229     for (unsigned i = 0; i < self->finished_states.size;) {
3230       QueryState *state = &self->finished_states.contents[i];
3231       const CaptureList *captures = capture_list_pool_get(
3232         &self->capture_list_pool,
3233         state->capture_list_id
3234       );
3235 
3236       // Remove states whose captures are all consumed.
3237       if (state->consumed_capture_count >= captures->size) {
3238         capture_list_pool_release(
3239           &self->capture_list_pool,
3240           state->capture_list_id
3241         );
3242         array_erase(&self->finished_states, i);
3243         continue;
3244       }
3245 
3246       // Skip captures that precede the cursor's start byte.
3247       TSNode node = captures->contents[state->consumed_capture_count].node;
3248       if (ts_node_end_byte(node) <= self->start_byte) {
3249         state->consumed_capture_count++;
3250         continue;
3251       }
3252 
3253       uint32_t node_start_byte = ts_node_start_byte(node);
3254       if (
3255         node_start_byte < first_finished_capture_byte ||
3256         (
3257           node_start_byte == first_finished_capture_byte &&
3258           state->pattern_index < first_finished_pattern_index
3259         )
3260       ) {
3261         first_finished_state = state;
3262         first_finished_capture_byte = node_start_byte;
3263         first_finished_pattern_index = state->pattern_index;
3264       }
3265       i++;
3266     }
3267 
3268     // If there is finished capture that is clearly before any unfinished
3269     // capture, then return its match, and its capture index. Internally
3270     // record the fact that the capture has been 'consumed'.
3271     QueryState *state;
3272     if (first_finished_state) {
3273       state = first_finished_state;
3274     } else if (first_unfinished_state_is_definite) {
3275       state = &self->states.contents[first_unfinished_state_index];
3276     } else {
3277       state = NULL;
3278     }
3279 
3280     if (state) {
3281       match->id = state->id;
3282       match->pattern_index = state->pattern_index;
3283       const CaptureList *captures = capture_list_pool_get(
3284         &self->capture_list_pool,
3285         state->capture_list_id
3286       );
3287       match->captures = captures->contents;
3288       match->capture_count = captures->size;
3289       *capture_index = state->consumed_capture_count;
3290       state->consumed_capture_count++;
3291       return true;
3292     }
3293 
3294     if (capture_list_pool_is_empty(&self->capture_list_pool)) {
3295       LOG(
3296         "  abandon state. index:%u, pattern:%u, offset:%u.\n",
3297         first_unfinished_state_index,
3298         first_unfinished_pattern_index,
3299         first_unfinished_capture_byte
3300       );
3301       capture_list_pool_release(
3302         &self->capture_list_pool,
3303         self->states.contents[first_unfinished_state_index].capture_list_id
3304       );
3305       array_erase(&self->states, first_unfinished_state_index);
3306     }
3307 
3308     // If there are no finished matches that are ready to be returned, then
3309     // continue finding more matches.
3310     if (
3311       !ts_query_cursor__advance(self, true) &&
3312       self->finished_states.size == 0
3313     ) return false;
3314   }
3315 }
3316 
3317 #undef LOG
3318