1 #include "tree_sitter/api.h"
2 #include "./alloc.h"
3 #include "./array.h"
4 #include "./language.h"
5 #include "./point.h"
6 #include "./tree_cursor.h"
7 #include "./unicode.h"
8 #include <wctype.h>
9 
10 // #define DEBUG_ANALYZE_QUERY
11 // #define DEBUG_EXECUTE_QUERY
12 
13 #define MAX_STEP_CAPTURE_COUNT 3
14 #define MAX_STATE_PREDECESSOR_COUNT 100
15 #define MAX_ANALYSIS_STATE_DEPTH 8
16 #define MAX_NEGATED_FIELD_COUNT 8
17 
18 /*
19  * Stream - A sequence of unicode characters derived from a UTF8 string.
20  * This struct is used in parsing queries from S-expressions.
21  */
22 typedef struct {
23   const char *input;
24   const char *start;
25   const char *end;
26   int32_t next;
27   uint8_t next_size;
28 } Stream;
29 
30 /*
31  * QueryStep - A step in the process of matching a query. Each node within
32  * a query S-expression corresponds to one of these steps. An entire pattern
33  * is represented as a sequence of these steps. The basic properties of a
34  * node are represented by these fields:
35  * - `symbol` - The grammar symbol to match. A zero value represents the
36  *    wildcard symbol, '_'.
37  * - `field` - The field name to match. A zero value means that a field name
38  *    was not specified.
39  * - `capture_ids` - An array of integers representing the names of captures
40  *    associated with this node in the pattern, terminated by a `NONE` value.
41  * - `depth` - The depth where this node occurs in the pattern. The root node
42  *    of the pattern has depth zero.
43  * - `negated_field_list_id` - An id representing a set of fields that must
44  *    that must not be present on a node matching this step.
45  *
46  * Steps have some additional fields in order to handle the `.` (or "anchor") operator,
47  * which forbids additional child nodes:
48  * - `is_immediate` - Indicates that the node matching this step cannot be preceded
49  *   by other sibling nodes that weren't specified in the pattern.
50  * - `is_last_child` - Indicates that the node matching this step cannot have any
51  *   subsequent named siblings.
52  *
53  * For simple patterns, steps are matched in sequential order. But in order to
54  * handle alternative/repeated/optional sub-patterns, query steps are not always
55  * structured as a linear sequence; they sometimes need to split and merge. This
56  * is done using the following fields:
57  *  - `alternative_index` - The index of a different query step that serves as
58  *    an alternative to this step. A `NONE` value represents no alternative.
59  *    When a query state reaches a step with an alternative index, the state
60  *    is duplicated, with one copy remaining at the original step, and one copy
61  *    moving to the alternative step. The alternative may have its own alternative
62  *    step, so this splitting is an iterative process.
63  * - `is_dead_end` - Indicates that this state cannot be passed directly, and
64  *    exists only in order to redirect to an alternative index, with no splitting.
65  * - `is_pass_through` - Indicates that state has no matching logic of its own,
66  *    and exists only to split a state. One copy of the state advances immediately
67  *    to the next step, and one moves to the alternative step.
68  * - `alternative_is_immediate` - Indicates that this step's alternative step
69  *    should be treated as if `is_immediate` is true.
70  *
71  * Steps also store some derived state that summarizes how they relate to other
72  * steps within the same pattern. This is used to optimize the matching process:
73  *  - `contains_captures` - Indicates that this step or one of its child steps
74  *     has a non-empty `capture_ids` list.
75  *  - `parent_pattern_guaranteed` - Indicates that if this step is reached, then
76  *     it and all of its subsequent sibling steps within the same parent pattern
77  *     are guaranteed to match.
78  *  - `root_pattern_guaranteed` - Similar to `parent_pattern_guaranteed`, but
79  *     for the entire top-level pattern. When iterating through a query's
80  *     captures using `ts_query_cursor_next_capture`, this field is used to
81  *     detect that a capture can safely be returned from a match that has not
82  *     even completed  yet.
83  */
84 typedef struct {
85   TSSymbol symbol;
86   TSSymbol supertype_symbol;
87   TSFieldId field;
88   uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
89   uint16_t depth;
90   uint16_t alternative_index;
91   uint16_t negated_field_list_id;
92   bool is_named: 1;
93   bool is_immediate: 1;
94   bool is_last_child: 1;
95   bool is_pass_through: 1;
96   bool is_dead_end: 1;
97   bool alternative_is_immediate: 1;
98   bool contains_captures: 1;
99   bool root_pattern_guaranteed: 1;
100   bool parent_pattern_guaranteed: 1;
101 } QueryStep;
102 
103 /*
104  * Slice - A slice of an external array. Within a query, capture names,
105  * literal string values, and predicate step informations are stored in three
106  * contiguous arrays. Individual captures, string values, and predicates are
107  * represented as slices of these three arrays.
108  */
109 typedef struct {
110   uint32_t offset;
111   uint32_t length;
112 } Slice;
113 
114 /*
115  * SymbolTable - a two-way mapping of strings to ids.
116  */
117 typedef struct {
118   Array(char) characters;
119   Array(Slice) slices;
120 } SymbolTable;
121 
122 /*
123  * PatternEntry - Information about the starting point for matching a particular
124  * pattern. These entries are stored in a 'pattern map' - a sorted array that
125  * makes it possible to efficiently lookup patterns based on the symbol for their
126  * first step. The entry consists of the following fields:
127  * - `pattern_index` - the index of the pattern within the query
128  * - `step_index` - the index of the pattern's first step in the shared `steps` array
129  * - `is_rooted` - whether or not the pattern has a single root node. This property
130  *   affects decisions about whether or not to start the pattern for nodes outside
131  *   of a QueryCursor's range restriction.
132  */
133 typedef struct {
134   uint16_t step_index;
135   uint16_t pattern_index;
136   bool is_rooted;
137 } PatternEntry;
138 
139 typedef struct {
140   Slice steps;
141   Slice predicate_steps;
142   uint32_t start_byte;
143 } QueryPattern;
144 
145 typedef struct {
146   uint32_t byte_offset;
147   uint16_t step_index;
148 } StepOffset;
149 
150 /*
151  * QueryState - The state of an in-progress match of a particular pattern
152  * in a query. While executing, a `TSQueryCursor` must keep track of a number
153  * of possible in-progress matches. Each of those possible matches is
154  * represented as one of these states. Fields:
155  * - `id` - A numeric id that is exposed to the public API. This allows the
156  *    caller to remove a given match, preventing any more of its captures
157  *    from being returned.
158  * - `start_depth` - The depth in the tree where the first step of the state's
159  *    pattern was matched.
160  * - `pattern_index` - The pattern that the state is matching.
161  * - `consumed_capture_count` - The number of captures from this match that
162  *    have already been returned.
163  * - `capture_list_id` - A numeric id that can be used to retrieve the state's
164  *    list of captures from the `CaptureListPool`.
165  * - `seeking_immediate_match` - A flag that indicates that the state's next
166  *    step must be matched by the very next sibling. This is used when
167  *    processing repetitions.
168  * - `has_in_progress_alternatives` - A flag that indicates that there is are
169  *    other states that have the same captures as this state, but are at
170  *    different steps in their pattern. This means that in order to obey the
171  *    'longest-match' rule, this state should not be returned as a match until
172  *    it is clear that there can be no other alternative match with more captures.
173  */
174 typedef struct {
175   uint32_t id;
176   uint32_t capture_list_id;
177   uint16_t start_depth;
178   uint16_t step_index;
179   uint16_t pattern_index;
180   uint16_t consumed_capture_count: 12;
181   bool seeking_immediate_match: 1;
182   bool has_in_progress_alternatives: 1;
183   bool dead: 1;
184   bool needs_parent: 1;
185 } QueryState;
186 
187 typedef Array(TSQueryCapture) CaptureList;
188 
189 /*
190  * CaptureListPool - A collection of *lists* of captures. Each query state needs
191  * to maintain its own list of captures. To avoid repeated allocations, this struct
192  * maintains a fixed set of capture lists, and keeps track of which ones are
193  * currently in use by a query state.
194  */
195 typedef struct {
196   Array(CaptureList) list;
197   CaptureList empty_list;
198   // The maximum number of capture lists that we are allowed to allocate. We
199   // never allow `list` to allocate more entries than this, dropping pending
200   // matches if needed to stay under the limit.
201   uint32_t max_capture_list_count;
202   // The number of capture lists allocated in `list` that are not currently in
203   // use. We reuse those existing-but-unused capture lists before trying to
204   // allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
205   // list's length to indicate that it's not in use.
206   uint32_t free_capture_list_count;
207 } CaptureListPool;
208 
209 /*
210  * AnalysisState - The state needed for walking the parse table when analyzing
211  * a query pattern, to determine at which steps the pattern might fail to match.
212  */
213 typedef struct {
214   TSStateId parse_state;
215   TSSymbol parent_symbol;
216   uint16_t child_index;
217   TSFieldId field_id: 15;
218   bool done: 1;
219 } AnalysisStateEntry;
220 
221 typedef struct {
222   AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH];
223   uint16_t depth;
224   uint16_t step_index;
225 } AnalysisState;
226 
227 typedef Array(AnalysisState) AnalysisStateSet;
228 
229 /*
230  * AnalysisSubgraph - A subset of the states in the parse table that are used
231  * in constructing nodes with a certain symbol. Each state is accompanied by
232  * some information about the possible node that could be produced in
233  * downstream states.
234  */
235 typedef struct {
236   TSStateId state;
237   uint8_t production_id;
238   uint8_t child_index: 7;
239   bool done: 1;
240 } AnalysisSubgraphNode;
241 
242 typedef struct {
243   TSSymbol symbol;
244   Array(TSStateId) start_states;
245   Array(AnalysisSubgraphNode) nodes;
246 } AnalysisSubgraph;
247 
248 /*
249  * StatePredecessorMap - A map that stores the predecessors of each parse state.
250  * This is used during query analysis to determine which parse states can lead
251  * to which reduce actions.
252  */
253 typedef struct {
254   TSStateId *contents;
255 } StatePredecessorMap;
256 
257 /*
258  * TSQuery - A tree query, compiled from a string of S-expressions. The query
259  * itself is immutable. The mutable state used in the process of executing the
260  * query is stored in a `TSQueryCursor`.
261  */
262 struct TSQuery {
263   SymbolTable captures;
264   SymbolTable predicate_values;
265   Array(QueryStep) steps;
266   Array(PatternEntry) pattern_map;
267   Array(TSQueryPredicateStep) predicate_steps;
268   Array(QueryPattern) patterns;
269   Array(StepOffset) step_offsets;
270   Array(TSFieldId) negated_fields;
271   Array(char) string_buffer;
272   const TSLanguage *language;
273   uint16_t wildcard_root_pattern_count;
274 };
275 
276 /*
277  * TSQueryCursor - A stateful struct used to execute a query on a tree.
278  */
279 struct TSQueryCursor {
280   const TSQuery *query;
281   TSTreeCursor cursor;
282   Array(QueryState) states;
283   Array(QueryState) finished_states;
284   CaptureListPool capture_list_pool;
285   uint32_t depth;
286   uint32_t start_byte;
287   uint32_t end_byte;
288   TSPoint start_point;
289   TSPoint end_point;
290   uint32_t next_state_id;
291   bool ascending;
292   bool halted;
293   bool did_exceed_match_limit;
294 };
295 
296 static const TSQueryError PARENT_DONE = -1;
297 static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX;
298 static const uint16_t NONE = UINT16_MAX;
299 static const TSSymbol WILDCARD_SYMBOL = 0;
300 
301 /**********
302  * Stream
303  **********/
304 
305 // Advance to the next unicode code point in the stream.
stream_advance(Stream * self)306 static bool stream_advance(Stream *self) {
307   self->input += self->next_size;
308   if (self->input < self->end) {
309     uint32_t size = ts_decode_utf8(
310       (const uint8_t *)self->input,
311       self->end - self->input,
312       &self->next
313     );
314     if (size > 0) {
315       self->next_size = size;
316       return true;
317     }
318   } else {
319     self->next_size = 0;
320     self->next = '\0';
321   }
322   return false;
323 }
324 
325 // Reset the stream to the given input position, represented as a pointer
326 // into the input string.
stream_reset(Stream * self,const char * input)327 static void stream_reset(Stream *self, const char *input) {
328   self->input = input;
329   self->next_size = 0;
330   stream_advance(self);
331 }
332 
stream_new(const char * string,uint32_t length)333 static Stream stream_new(const char *string, uint32_t length) {
334   Stream self = {
335     .next = 0,
336     .input = string,
337     .start = string,
338     .end = string + length,
339   };
340   stream_advance(&self);
341   return self;
342 }
343 
stream_skip_whitespace(Stream * self)344 static void stream_skip_whitespace(Stream *self) {
345   for (;;) {
346     if (iswspace(self->next)) {
347       stream_advance(self);
348     } else if (self->next == ';') {
349       // skip over comments
350       stream_advance(self);
351       while (self->next && self->next != '\n') {
352         if (!stream_advance(self)) break;
353       }
354     } else {
355       break;
356     }
357   }
358 }
359 
stream_is_ident_start(Stream * self)360 static bool stream_is_ident_start(Stream *self) {
361   return iswalnum(self->next) || self->next == '_' || self->next == '-';
362 }
363 
stream_scan_identifier(Stream * stream)364 static void stream_scan_identifier(Stream *stream) {
365   do {
366     stream_advance(stream);
367   } while (
368     iswalnum(stream->next) ||
369     stream->next == '_' ||
370     stream->next == '-' ||
371     stream->next == '.' ||
372     stream->next == '?' ||
373     stream->next == '!'
374   );
375 }
376 
stream_offset(Stream * self)377 static uint32_t stream_offset(Stream *self) {
378   return self->input - self->start;
379 }
380 
381 /******************
382  * CaptureListPool
383  ******************/
384 
capture_list_pool_new(void)385 static CaptureListPool capture_list_pool_new(void) {
386   return (CaptureListPool) {
387     .list = array_new(),
388     .empty_list = array_new(),
389     .max_capture_list_count = UINT32_MAX,
390     .free_capture_list_count = 0,
391   };
392 }
393 
capture_list_pool_reset(CaptureListPool * self)394 static void capture_list_pool_reset(CaptureListPool *self) {
395   for (uint16_t i = 0; i < self->list.size; i++) {
396     // This invalid size means that the list is not in use.
397     self->list.contents[i].size = UINT32_MAX;
398   }
399   self->free_capture_list_count = self->list.size;
400 }
401 
capture_list_pool_delete(CaptureListPool * self)402 static void capture_list_pool_delete(CaptureListPool *self) {
403   for (uint16_t i = 0; i < self->list.size; i++) {
404     array_delete(&self->list.contents[i]);
405   }
406   array_delete(&self->list);
407 }
408 
capture_list_pool_get(const CaptureListPool * self,uint16_t id)409 static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) {
410   if (id >= self->list.size) return &self->empty_list;
411   return &self->list.contents[id];
412 }
413 
capture_list_pool_get_mut(CaptureListPool * self,uint16_t id)414 static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) {
415   assert(id < self->list.size);
416   return &self->list.contents[id];
417 }
418 
capture_list_pool_is_empty(const CaptureListPool * self)419 static bool capture_list_pool_is_empty(const CaptureListPool *self) {
420   // The capture list pool is empty if all allocated lists are in use, and we
421   // have reached the maximum allowed number of allocated lists.
422   return self->free_capture_list_count == 0 && self->list.size >= self->max_capture_list_count;
423 }
424 
capture_list_pool_acquire(CaptureListPool * self)425 static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
426   // First see if any already allocated capture list is currently unused.
427   if (self->free_capture_list_count > 0) {
428     for (uint16_t i = 0; i < self->list.size; i++) {
429       if (self->list.contents[i].size == UINT32_MAX) {
430         array_clear(&self->list.contents[i]);
431         self->free_capture_list_count--;
432         return i;
433       }
434     }
435   }
436 
437   // Otherwise allocate and initialize a new capture list, as long as that
438   // doesn't put us over the requested maximum.
439   uint32_t i = self->list.size;
440   if (i >= self->max_capture_list_count) {
441     return NONE;
442   }
443   CaptureList list;
444   array_init(&list);
445   array_push(&self->list, list);
446   return i;
447 }
448 
capture_list_pool_release(CaptureListPool * self,uint16_t id)449 static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
450   if (id >= self->list.size) return;
451   self->list.contents[id].size = UINT32_MAX;
452   self->free_capture_list_count++;
453 }
454 
455 /**************
456  * SymbolTable
457  **************/
458 
symbol_table_new(void)459 static SymbolTable symbol_table_new(void) {
460   return (SymbolTable) {
461     .characters = array_new(),
462     .slices = array_new(),
463   };
464 }
465 
symbol_table_delete(SymbolTable * self)466 static void symbol_table_delete(SymbolTable *self) {
467   array_delete(&self->characters);
468   array_delete(&self->slices);
469 }
470 
symbol_table_id_for_name(const SymbolTable * self,const char * name,uint32_t length)471 static int symbol_table_id_for_name(
472   const SymbolTable *self,
473   const char *name,
474   uint32_t length
475 ) {
476   for (unsigned i = 0; i < self->slices.size; i++) {
477     Slice slice = self->slices.contents[i];
478     if (
479       slice.length == length &&
480       !strncmp(&self->characters.contents[slice.offset], name, length)
481     ) return i;
482   }
483   return -1;
484 }
485 
symbol_table_name_for_id(const SymbolTable * self,uint16_t id,uint32_t * length)486 static const char *symbol_table_name_for_id(
487   const SymbolTable *self,
488   uint16_t id,
489   uint32_t *length
490 ) {
491   Slice slice = self->slices.contents[id];
492   *length = slice.length;
493   return &self->characters.contents[slice.offset];
494 }
495 
symbol_table_insert_name(SymbolTable * self,const char * name,uint32_t length)496 static uint16_t symbol_table_insert_name(
497   SymbolTable *self,
498   const char *name,
499   uint32_t length
500 ) {
501   int id = symbol_table_id_for_name(self, name, length);
502   if (id >= 0) return (uint16_t)id;
503   Slice slice = {
504     .offset = self->characters.size,
505     .length = length,
506   };
507   array_grow_by(&self->characters, length + 1);
508   memcpy(&self->characters.contents[slice.offset], name, length);
509   self->characters.contents[self->characters.size - 1] = 0;
510   array_push(&self->slices, slice);
511   return self->slices.size - 1;
512 }
513 
514 /************
515  * QueryStep
516  ************/
517 
query_step__new(TSSymbol symbol,uint16_t depth,bool is_immediate)518 static QueryStep query_step__new(
519   TSSymbol symbol,
520   uint16_t depth,
521   bool is_immediate
522 ) {
523   return (QueryStep) {
524     .symbol = symbol,
525     .depth = depth,
526     .field = 0,
527     .capture_ids = {NONE, NONE, NONE},
528     .alternative_index = NONE,
529     .negated_field_list_id = 0,
530     .contains_captures = false,
531     .is_last_child = false,
532     .is_named = false,
533     .is_pass_through = false,
534     .is_dead_end = false,
535     .root_pattern_guaranteed = false,
536     .is_immediate = is_immediate,
537     .alternative_is_immediate = false,
538   };
539 }
540 
query_step__add_capture(QueryStep * self,uint16_t capture_id)541 static void query_step__add_capture(QueryStep *self, uint16_t capture_id) {
542   for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) {
543     if (self->capture_ids[i] == NONE) {
544       self->capture_ids[i] = capture_id;
545       break;
546     }
547   }
548 }
549 
query_step__remove_capture(QueryStep * self,uint16_t capture_id)550 static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) {
551   for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) {
552     if (self->capture_ids[i] == capture_id) {
553       self->capture_ids[i] = NONE;
554       while (i + 1 < MAX_STEP_CAPTURE_COUNT) {
555         if (self->capture_ids[i + 1] == NONE) break;
556         self->capture_ids[i] = self->capture_ids[i + 1];
557         self->capture_ids[i + 1] = NONE;
558         i++;
559       }
560       break;
561     }
562   }
563 }
564 
565 /**********************
566  * StatePredecessorMap
567  **********************/
568 
state_predecessor_map_new(const TSLanguage * language)569 static inline StatePredecessorMap state_predecessor_map_new(
570   const TSLanguage *language
571 ) {
572   return (StatePredecessorMap) {
573     .contents = ts_calloc(
574       language->state_count * (MAX_STATE_PREDECESSOR_COUNT + 1),
575       sizeof(TSStateId)
576     ),
577   };
578 }
579 
state_predecessor_map_delete(StatePredecessorMap * self)580 static inline void state_predecessor_map_delete(StatePredecessorMap *self) {
581   ts_free(self->contents);
582 }
583 
state_predecessor_map_add(StatePredecessorMap * self,TSStateId state,TSStateId predecessor)584 static inline void state_predecessor_map_add(
585   StatePredecessorMap *self,
586   TSStateId state,
587   TSStateId predecessor
588 ) {
589   unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1);
590   TSStateId *count = &self->contents[index];
591   if (
592     *count == 0 ||
593     (*count < MAX_STATE_PREDECESSOR_COUNT && self->contents[index + *count] != predecessor)
594   ) {
595     (*count)++;
596     self->contents[index + *count] = predecessor;
597   }
598 }
599 
state_predecessor_map_get(const StatePredecessorMap * self,TSStateId state,unsigned * count)600 static inline const TSStateId *state_predecessor_map_get(
601   const StatePredecessorMap *self,
602   TSStateId state,
603   unsigned *count
604 ) {
605   unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1);
606   *count = self->contents[index];
607   return &self->contents[index + 1];
608 }
609 
610 /****************
611  * AnalysisState
612  ****************/
613 
analysis_state__recursion_depth(const AnalysisState * self)614 static unsigned analysis_state__recursion_depth(const AnalysisState *self) {
615   unsigned result = 0;
616   for (unsigned i = 0; i < self->depth; i++) {
617     TSSymbol symbol = self->stack[i].parent_symbol;
618     for (unsigned j = 0; j < i; j++) {
619       if (self->stack[j].parent_symbol == symbol) {
620         result++;
621         break;
622       }
623     }
624   }
625   return result;
626 }
627 
analysis_state__compare_position(const AnalysisState * self,const AnalysisState * other)628 static inline int analysis_state__compare_position(
629   const AnalysisState *self,
630   const AnalysisState *other
631 ) {
632   for (unsigned i = 0; i < self->depth; i++) {
633     if (i >= other->depth) return -1;
634     if (self->stack[i].child_index < other->stack[i].child_index) return -1;
635     if (self->stack[i].child_index > other->stack[i].child_index) return 1;
636   }
637   if (self->depth < other->depth) return 1;
638   return 0;
639 }
640 
analysis_state__compare(const AnalysisState * self,const AnalysisState * other)641 static inline int analysis_state__compare(
642   const AnalysisState *self,
643   const AnalysisState *other
644 ) {
645   int result = analysis_state__compare_position(self, other);
646   if (result != 0) return result;
647   for (unsigned i = 0; i < self->depth; i++) {
648     if (self->stack[i].parent_symbol < other->stack[i].parent_symbol) return -1;
649     if (self->stack[i].parent_symbol > other->stack[i].parent_symbol) return 1;
650     if (self->stack[i].parse_state < other->stack[i].parse_state) return -1;
651     if (self->stack[i].parse_state > other->stack[i].parse_state) return 1;
652     if (self->stack[i].field_id < other->stack[i].field_id) return -1;
653     if (self->stack[i].field_id > other->stack[i].field_id) return 1;
654   }
655   if (self->step_index < other->step_index) return -1;
656   if (self->step_index > other->step_index) return 1;
657   return 0;
658 }
659 
analysis_state__top(AnalysisState * self)660 static inline AnalysisStateEntry *analysis_state__top(AnalysisState *self) {
661   return &self->stack[self->depth - 1];
662 }
663 
analysis_state__has_supertype(AnalysisState * self,TSSymbol symbol)664 static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol symbol) {
665   for (unsigned i = 0; i < self->depth; i++) {
666     if (self->stack[i].parent_symbol == symbol) return true;
667   }
668   return false;
669 }
670 
671 /***********************
672  * AnalysisSubgraphNode
673  ***********************/
674 
analysis_subgraph_node__compare(const AnalysisSubgraphNode * self,const AnalysisSubgraphNode * other)675 static inline int analysis_subgraph_node__compare(const AnalysisSubgraphNode *self, const AnalysisSubgraphNode *other) {
676   if (self->state < other->state) return -1;
677   if (self->state > other->state) return 1;
678   if (self->child_index < other->child_index) return -1;
679   if (self->child_index > other->child_index) return 1;
680   if (self->done < other->done) return -1;
681   if (self->done > other->done) return 1;
682   if (self->production_id < other->production_id) return -1;
683   if (self->production_id > other->production_id) return 1;
684   return 0;
685 }
686 
687 /*********
688  * Query
689  *********/
690 
691 // The `pattern_map` contains a mapping from TSSymbol values to indices in the
692 // `steps` array. For a given syntax node, the `pattern_map` makes it possible
693 // to quickly find the starting steps of all of the patterns whose root matches
694 // that node. Each entry has two fields: a `pattern_index`, which identifies one
695 // of the patterns in the query, and a `step_index`, which indicates the start
696 // offset of that pattern's steps within the `steps` array.
697 //
698 // The entries are sorted by the patterns' root symbols, and lookups use a
699 // binary search. This ensures that the cost of this initial lookup step
700 // scales logarithmically with the number of patterns in the query.
701 //
702 // This returns `true` if the symbol is present and `false` otherwise.
703 // If the symbol is not present `*result` is set to the index where the
704 // symbol should be inserted.
ts_query__pattern_map_search(const TSQuery * self,TSSymbol needle,uint32_t * result)705 static inline bool ts_query__pattern_map_search(
706   const TSQuery *self,
707   TSSymbol needle,
708   uint32_t *result
709 ) {
710   uint32_t base_index = self->wildcard_root_pattern_count;
711   uint32_t size = self->pattern_map.size - base_index;
712   if (size == 0) {
713     *result = base_index;
714     return false;
715   }
716   while (size > 1) {
717     uint32_t half_size = size / 2;
718     uint32_t mid_index = base_index + half_size;
719     TSSymbol mid_symbol = self->steps.contents[
720       self->pattern_map.contents[mid_index].step_index
721     ].symbol;
722     if (needle > mid_symbol) base_index = mid_index;
723     size -= half_size;
724   }
725 
726   TSSymbol symbol = self->steps.contents[
727     self->pattern_map.contents[base_index].step_index
728   ].symbol;
729 
730   if (needle > symbol) {
731     base_index++;
732     if (base_index < self->pattern_map.size) {
733       symbol = self->steps.contents[
734         self->pattern_map.contents[base_index].step_index
735       ].symbol;
736     }
737   }
738 
739   *result = base_index;
740   return needle == symbol;
741 }
742 
743 // Insert a new pattern's start index into the pattern map, maintaining
744 // the pattern map's ordering invariant.
ts_query__pattern_map_insert(TSQuery * self,TSSymbol symbol,PatternEntry new_entry)745 static inline void ts_query__pattern_map_insert(
746   TSQuery *self,
747   TSSymbol symbol,
748   PatternEntry new_entry
749 ) {
750   uint32_t index;
751   ts_query__pattern_map_search(self, symbol, &index);
752 
753   // Ensure that the entries are sorted not only by symbol, but also
754   // by pattern_index. This way, states for earlier patterns will be
755   // initiated first, which allows the ordering of the states array
756   // to be maintained more efficiently.
757   while (index < self->pattern_map.size) {
758     PatternEntry *entry = &self->pattern_map.contents[index];
759     if (
760       self->steps.contents[entry->step_index].symbol == symbol &&
761       entry->pattern_index < new_entry.pattern_index
762     ) {
763       index++;
764     } else {
765       break;
766     }
767   }
768 
769   array_insert(&self->pattern_map, index, new_entry);
770 }
771 
ts_query__analyze_patterns(TSQuery * self,unsigned * error_offset)772 static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
773   // Walk forward through all of the steps in the query, computing some
774   // basic information about each step. Mark all of the steps that contain
775   // captures, and record the indices of all of the steps that have child steps.
776   Array(uint32_t) parent_step_indices = array_new();
777   for (unsigned i = 0; i < self->steps.size; i++) {
778     QueryStep *step = &self->steps.contents[i];
779     if (step->depth == PATTERN_DONE_MARKER) {
780       step->parent_pattern_guaranteed = true;
781       step->root_pattern_guaranteed = true;
782       continue;
783     }
784 
785     bool has_children = false;
786     bool is_wildcard = step->symbol == WILDCARD_SYMBOL;
787     step->contains_captures = step->capture_ids[0] != NONE;
788     for (unsigned j = i + 1; j < self->steps.size; j++) {
789       QueryStep *next_step = &self->steps.contents[j];
790       if (
791         next_step->depth == PATTERN_DONE_MARKER ||
792         next_step->depth <= step->depth
793       ) break;
794       if (next_step->capture_ids[0] != NONE) {
795         step->contains_captures = true;
796       }
797       if (!is_wildcard) {
798         next_step->root_pattern_guaranteed = true;
799         next_step->parent_pattern_guaranteed = true;
800       }
801       has_children = true;
802     }
803 
804     if (has_children && !is_wildcard) {
805       array_push(&parent_step_indices, i);
806     }
807   }
808 
809   // For every parent symbol in the query, initialize an 'analysis subgraph'.
810   // This subgraph lists all of the states in the parse table that are directly
811   // involved in building subtrees for this symbol.
812   //
813   // In addition to the parent symbols in the query, construct subgraphs for all
814   // of the hidden symbols in the grammar, because these might occur within
815   // one of the parent nodes, such that their children appear to belong to the
816   // parent.
817   Array(AnalysisSubgraph) subgraphs = array_new();
818   for (unsigned i = 0; i < parent_step_indices.size; i++) {
819     uint32_t parent_step_index = parent_step_indices.contents[i];
820     TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol;
821     AnalysisSubgraph subgraph = { .symbol = parent_symbol };
822     array_insert_sorted_by(&subgraphs, .symbol, subgraph);
823   }
824   for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) {
825     if (!ts_language_symbol_metadata(self->language, sym).visible) {
826       AnalysisSubgraph subgraph = { .symbol = sym };
827       array_insert_sorted_by(&subgraphs, .symbol, subgraph);
828     }
829   }
830 
831   // Scan the parse table to find the data needed to populate these subgraphs.
832   // Collect three things during this scan:
833   //   1) All of the parse states where one of these symbols can start.
834   //   2) All of the parse states where one of these symbols can end, along
835   //      with information about the node that would be created.
836   //   3) A list of predecessor states for each state.
837   StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language);
838   for (TSStateId state = 1; state < self->language->state_count; state++) {
839     unsigned subgraph_index, exists;
840     LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state);
841     while (ts_lookahead_iterator_next(&lookahead_iterator)) {
842       if (lookahead_iterator.action_count) {
843         for (unsigned i = 0; i < lookahead_iterator.action_count; i++) {
844           const TSParseAction *action = &lookahead_iterator.actions[i];
845           if (action->type == TSParseActionTypeReduce) {
846             const TSSymbol *aliases, *aliases_end;
847             ts_language_aliases_for_symbol(
848               self->language,
849               action->reduce.symbol,
850               &aliases,
851               &aliases_end
852             );
853             for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
854               array_search_sorted_by(
855                 &subgraphs,
856                 .symbol,
857                 *symbol,
858                 &subgraph_index,
859                 &exists
860               );
861               if (exists) {
862                 AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
863                 if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
864                   array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
865                     .state = state,
866                     .production_id = action->reduce.production_id,
867                     .child_index = action->reduce.child_count,
868                     .done = true,
869                   }));
870                 }
871               }
872             }
873           } else if (action->type == TSParseActionTypeShift && !action->shift.extra) {
874             TSStateId next_state = action->shift.state;
875             state_predecessor_map_add(&predecessor_map, next_state, state);
876           }
877         }
878       } else if (lookahead_iterator.next_state != 0) {
879         if (lookahead_iterator.next_state != state) {
880           state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
881         }
882         const TSSymbol *aliases, *aliases_end;
883         ts_language_aliases_for_symbol(
884           self->language,
885           lookahead_iterator.symbol,
886           &aliases,
887           &aliases_end
888         );
889         for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
890           array_search_sorted_by(
891             &subgraphs,
892             .symbol,
893             *symbol,
894             &subgraph_index,
895             &exists
896           );
897           if (exists) {
898             AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
899             if (
900               subgraph->start_states.size == 0 ||
901               *array_back(&subgraph->start_states) != state
902             )
903             array_push(&subgraph->start_states, state);
904           }
905         }
906       }
907     }
908   }
909 
910   // For each subgraph, compute the preceding states by walking backward
911   // from the end states using the predecessor map.
912   Array(AnalysisSubgraphNode) next_nodes = array_new();
913   for (unsigned i = 0; i < subgraphs.size; i++) {
914     AnalysisSubgraph *subgraph = &subgraphs.contents[i];
915     if (subgraph->nodes.size == 0) {
916       array_delete(&subgraph->start_states);
917       array_erase(&subgraphs, i);
918       i--;
919       continue;
920     }
921     array_assign(&next_nodes, &subgraph->nodes);
922     while (next_nodes.size > 0) {
923       AnalysisSubgraphNode node = array_pop(&next_nodes);
924       if (node.child_index > 1) {
925         unsigned predecessor_count;
926         const TSStateId *predecessors = state_predecessor_map_get(
927           &predecessor_map,
928           node.state,
929           &predecessor_count
930         );
931         for (unsigned j = 0; j < predecessor_count; j++) {
932           AnalysisSubgraphNode predecessor_node = {
933             .state = predecessors[j],
934             .child_index = node.child_index - 1,
935             .production_id = node.production_id,
936             .done = false,
937           };
938           unsigned index, exists;
939           array_search_sorted_with(
940             &subgraph->nodes, analysis_subgraph_node__compare, &predecessor_node,
941             &index, &exists
942           );
943           if (!exists) {
944             array_insert(&subgraph->nodes, index, predecessor_node);
945             array_push(&next_nodes, predecessor_node);
946           }
947         }
948       }
949     }
950   }
951 
952   #ifdef DEBUG_ANALYZE_QUERY
953     printf("\nSubgraphs:\n");
954     for (unsigned i = 0; i < subgraphs.size; i++) {
955       AnalysisSubgraph *subgraph = &subgraphs.contents[i];
956       printf("  %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol));
957       for (unsigned j = 0; j < subgraph->start_states.size; j++) {
958         printf(
959           "    {state: %u}\n",
960           subgraph->start_states.contents[j]
961         );
962       }
963       for (unsigned j = 0; j < subgraph->nodes.size; j++) {
964         AnalysisSubgraphNode *node = &subgraph->nodes.contents[j];
965         printf(
966           "    {state: %u, child_index: %u, production_id: %u, done: %d}\n",
967           node->state, node->child_index, node->production_id, node->done
968         );
969       }
970       printf("\n");
971     }
972   #endif
973 
974   // For each non-terminal pattern, determine if the pattern can successfully match,
975   // and identify all of the possible children within the pattern where matching could fail.
976   bool all_patterns_are_valid = true;
977   AnalysisStateSet states = array_new();
978   AnalysisStateSet next_states = array_new();
979   AnalysisStateSet deeper_states = array_new();
980   Array(uint16_t) final_step_indices = array_new();
981   for (unsigned i = 0; i < parent_step_indices.size; i++) {
982     uint16_t parent_step_index = parent_step_indices.contents[i];
983     uint16_t parent_depth = self->steps.contents[parent_step_index].depth;
984     TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol;
985     if (parent_symbol == ts_builtin_sym_error) continue;
986 
987     // Find the subgraph that corresponds to this pattern's root symbol. If the pattern's
988     // root symbol is a terminal, then return an error.
989     unsigned subgraph_index, exists;
990     array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists);
991     if (!exists) {
992       unsigned first_child_step_index = parent_step_index + 1;
993       uint32_t i, exists;
994       array_search_sorted_by(&self->step_offsets, .step_index, first_child_step_index, &i, &exists);
995       assert(exists);
996       *error_offset = self->step_offsets.contents[i].byte_offset;
997       all_patterns_are_valid = false;
998       break;
999     }
1000 
1001     // Initialize an analysis state at every parse state in the table where
1002     // this parent symbol can occur.
1003     AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
1004     array_clear(&states);
1005     array_clear(&deeper_states);
1006     for (unsigned j = 0; j < subgraph->start_states.size; j++) {
1007       TSStateId parse_state = subgraph->start_states.contents[j];
1008       array_push(&states, ((AnalysisState) {
1009         .step_index = parent_step_index + 1,
1010         .stack = {
1011           [0] = {
1012             .parse_state = parse_state,
1013             .parent_symbol = parent_symbol,
1014             .child_index = 0,
1015             .field_id = 0,
1016             .done = false,
1017           },
1018         },
1019         .depth = 1,
1020       }));
1021     }
1022 
1023     // Walk the subgraph for this non-terminal, tracking all of the possible
1024     // sequences of progress within the pattern.
1025     bool can_finish_pattern = false;
1026     bool did_exceed_max_depth = false;
1027     unsigned recursion_depth_limit = 0;
1028     unsigned prev_final_step_count = 0;
1029     array_clear(&final_step_indices);
1030     for (;;) {
1031       #ifdef DEBUG_ANALYZE_QUERY
1032         printf("Final step indices:");
1033         for (unsigned j = 0; j < final_step_indices.size; j++) {
1034           printf(" %4u", final_step_indices.contents[j]);
1035         }
1036         printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol));
1037         for (unsigned j = 0; j < states.size; j++) {
1038           AnalysisState *state = &states.contents[j];
1039           printf("  %3u: step: %u, stack: [", j, state->step_index);
1040           for (unsigned k = 0; k < state->depth; k++) {
1041             printf(
1042               " {%s, child: %u, state: %4u",
1043               self->language->symbol_names[state->stack[k].parent_symbol],
1044               state->stack[k].child_index,
1045               state->stack[k].parse_state
1046             );
1047             if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]);
1048             if (state->stack[k].done) printf(", DONE");
1049             printf("}");
1050           }
1051           printf(" ]\n");
1052         }
1053       #endif
1054 
1055       // If no further progress can be made within the current recursion depth limit, then
1056       // bump the depth limit by one, and continue to process the states the exceeded the
1057       // limit. But only allow this if progress has been made since the last time the depth
1058       // limit was increased.
1059       if (states.size == 0) {
1060         if (deeper_states.size > 0 && final_step_indices.size > prev_final_step_count) {
1061           #ifdef DEBUG_ANALYZE_QUERY
1062             printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1);
1063           #endif
1064 
1065           prev_final_step_count = final_step_indices.size;
1066           recursion_depth_limit++;
1067           AnalysisStateSet _states = states;
1068           states = deeper_states;
1069           deeper_states = _states;
1070           continue;
1071         }
1072 
1073         break;
1074       }
1075 
1076       array_clear(&next_states);
1077       for (unsigned j = 0; j < states.size; j++) {
1078         AnalysisState * const state = &states.contents[j];
1079 
1080         // For efficiency, it's important to avoid processing the same analysis state more
1081         // than once. To achieve this, keep the states in order of ascending position within
1082         // their hypothetical syntax trees. In each iteration of this loop, start by advancing
1083         // the states that have made the least progress. Avoid advancing states that have already
1084         // made more progress.
1085         if (next_states.size > 0) {
1086           int comparison = analysis_state__compare_position(state, array_back(&next_states));
1087           if (comparison == 0) {
1088             array_insert_sorted_with(&next_states, analysis_state__compare, *state);
1089             continue;
1090           } else if (comparison > 0) {
1091             while (j < states.size) {
1092               array_push(&next_states, states.contents[j]);
1093               j++;
1094             }
1095             break;
1096           }
1097         }
1098 
1099         const TSStateId parse_state = analysis_state__top(state)->parse_state;
1100         const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol;
1101         const TSFieldId parent_field_id = analysis_state__top(state)->field_id;
1102         const unsigned child_index = analysis_state__top(state)->child_index;
1103         const QueryStep * const step = &self->steps.contents[state->step_index];
1104 
1105         unsigned subgraph_index, exists;
1106         array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists);
1107         if (!exists) continue;
1108         const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
1109 
1110         // Follow every possible path in the parse table, but only visit states that
1111         // are part of the subgraph for the current symbol.
1112         LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state);
1113         while (ts_lookahead_iterator_next(&lookahead_iterator)) {
1114           TSSymbol sym = lookahead_iterator.symbol;
1115 
1116           AnalysisSubgraphNode successor = {
1117             .state = parse_state,
1118             .child_index = child_index,
1119           };
1120           if (lookahead_iterator.action_count) {
1121             const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1];
1122             if (action->type == TSParseActionTypeShift) {
1123               if (!action->shift.extra) {
1124                 successor.state = action->shift.state;
1125                 successor.child_index++;
1126               }
1127             } else {
1128               continue;
1129             }
1130           } else if (lookahead_iterator.next_state != 0) {
1131             successor.state = lookahead_iterator.next_state;
1132             successor.child_index++;
1133           } else {
1134             continue;
1135           }
1136 
1137           unsigned node_index;
1138           array_search_sorted_with(
1139             &subgraph->nodes,
1140             analysis_subgraph_node__compare, &successor,
1141             &node_index, &exists
1142           );
1143           while (node_index < subgraph->nodes.size) {
1144             AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
1145             if (node->state != successor.state || node->child_index != successor.child_index) break;
1146 
1147             // Use the subgraph to determine what alias and field will eventually be applied
1148             // to this child node.
1149             TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
1150             TSSymbol visible_symbol = alias
1151               ? alias
1152               : self->language->symbol_metadata[sym].visible
1153                 ? self->language->public_symbol_map[sym]
1154                 : 0;
1155             TSFieldId field_id = parent_field_id;
1156             if (!field_id) {
1157               const TSFieldMapEntry *field_map, *field_map_end;
1158               ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
1159               for (; field_map != field_map_end; field_map++) {
1160                 if (!field_map->inherited && field_map->child_index == child_index) {
1161                   field_id = field_map->field_id;
1162                   break;
1163                 }
1164               }
1165             }
1166 
1167             // Create a new state that has advanced past this hypothetical subtree.
1168             AnalysisState next_state = *state;
1169             AnalysisStateEntry *next_state_top = analysis_state__top(&next_state);
1170             next_state_top->child_index = successor.child_index;
1171             next_state_top->parse_state = successor.state;
1172             if (node->done) next_state_top->done = true;
1173 
1174             // Determine if this hypothetical child node would match the current step
1175             // of the query pattern.
1176             bool does_match = false;
1177             if (visible_symbol) {
1178               does_match = true;
1179               if (step->symbol == WILDCARD_SYMBOL) {
1180                 if (
1181                   step->is_named &&
1182                   !self->language->symbol_metadata[visible_symbol].named
1183                 ) does_match = false;
1184               } else if (step->symbol != visible_symbol) {
1185                 does_match = false;
1186               }
1187               if (step->field && step->field != field_id) {
1188                 does_match = false;
1189               }
1190               if (
1191                 step->supertype_symbol &&
1192                 !analysis_state__has_supertype(state, step->supertype_symbol)
1193               ) does_match = false;
1194             }
1195 
1196             // If this child is hidden, then descend into it and walk through its children.
1197             // If the top entry of the stack is at the end of its rule, then that entry can
1198             // be replaced. Otherwise, push a new entry onto the stack.
1199             else if (sym >= self->language->token_count) {
1200               if (!next_state_top->done) {
1201                 if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) {
1202                   #ifdef DEBUG_ANALYZE_QUERY
1203                     printf("Exceeded depth limit for state %u\n", j);
1204                   #endif
1205 
1206                   did_exceed_max_depth = true;
1207                   continue;
1208                 }
1209 
1210                 next_state.depth++;
1211                 next_state_top = analysis_state__top(&next_state);
1212               }
1213 
1214               *next_state_top = (AnalysisStateEntry) {
1215                 .parse_state = parse_state,
1216                 .parent_symbol = sym,
1217                 .child_index = 0,
1218                 .field_id = field_id,
1219                 .done = false,
1220               };
1221 
1222               if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) {
1223                 array_insert_sorted_with(&deeper_states, analysis_state__compare, next_state);
1224                 continue;
1225               }
1226             }
1227 
1228             // Pop from the stack when this state reached the end of its current syntax node.
1229             while (next_state.depth > 0 && next_state_top->done) {
1230               next_state.depth--;
1231               next_state_top = analysis_state__top(&next_state);
1232             }
1233 
1234             // If this hypothetical child did match the current step of the query pattern,
1235             // then advance to the next step at the current depth. This involves skipping
1236             // over any descendant steps of the current child.
1237             const QueryStep *next_step = step;
1238             if (does_match) {
1239               for (;;) {
1240                 next_state.step_index++;
1241                 next_step = &self->steps.contents[next_state.step_index];
1242                 if (
1243                   next_step->depth == PATTERN_DONE_MARKER ||
1244                   next_step->depth <= parent_depth + 1
1245                 ) break;
1246               }
1247             } else if (successor.state == parse_state) {
1248               continue;
1249             }
1250 
1251             for (;;) {
1252               // Skip pass-through states. Although these states have alternatives, they are only
1253               // used to implement repetitions, and query analysis does not need to process
1254               // repetitions in order to determine whether steps are possible and definite.
1255               if (next_step->is_pass_through) {
1256                 next_state.step_index++;
1257                 next_step++;
1258                 continue;
1259               }
1260 
1261               // If the pattern is finished or hypothetical parent node is complete, then
1262               // record that matching can terminate at this step of the pattern. Otherwise,
1263               // add this state to the list of states to process on the next iteration.
1264               if (!next_step->is_dead_end) {
1265                 bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
1266                 if (did_finish_pattern) can_finish_pattern = true;
1267                 if (did_finish_pattern || next_state.depth == 0) {
1268                   array_insert_sorted_by(&final_step_indices, , next_state.step_index);
1269                 } else {
1270                   array_insert_sorted_with(&next_states, analysis_state__compare, next_state);
1271                 }
1272               }
1273 
1274               // If the state has advanced to a step with an alternative step, then add another state
1275               // at that alternative step. This process is simpler than the process of actually matching a
1276               // pattern during query exection, because for the purposes of query analysis, there is no
1277               // need to process repetitions.
1278               if (
1279                 does_match &&
1280                 next_step->alternative_index != NONE &&
1281                 next_step->alternative_index > next_state.step_index
1282               ) {
1283                 next_state.step_index = next_step->alternative_index;
1284                 next_step = &self->steps.contents[next_state.step_index];
1285               } else {
1286                 break;
1287               }
1288             }
1289           }
1290         }
1291       }
1292 
1293       AnalysisStateSet _states = states;
1294       states = next_states;
1295       next_states = _states;
1296     }
1297 
1298     // Mark as indefinite any step where a match terminated.
1299     // Later, this property will be propagated to all of the step's predecessors.
1300     for (unsigned j = 0; j < final_step_indices.size; j++) {
1301       uint32_t final_step_index = final_step_indices.contents[j];
1302       QueryStep *step = &self->steps.contents[final_step_index];
1303       if (
1304         step->depth != PATTERN_DONE_MARKER &&
1305         step->depth > parent_depth &&
1306         !step->is_dead_end
1307       ) {
1308         step->parent_pattern_guaranteed = false;
1309         step->root_pattern_guaranteed = false;
1310       }
1311     }
1312 
1313     if (did_exceed_max_depth) {
1314       for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) {
1315         QueryStep *step = &self->steps.contents[j];
1316         if (
1317           step->depth <= parent_depth ||
1318           step->depth == PATTERN_DONE_MARKER
1319         ) break;
1320         if (!step->is_dead_end) {
1321           step->parent_pattern_guaranteed = false;
1322           step->root_pattern_guaranteed = false;
1323         }
1324       }
1325     }
1326 
1327     // If this pattern cannot match, store the pattern index so that it can be
1328     // returned to the caller.
1329     if (all_patterns_are_valid && !can_finish_pattern && !did_exceed_max_depth) {
1330       assert(final_step_indices.size > 0);
1331       uint16_t impossible_step_index = *array_back(&final_step_indices);
1332       uint32_t i, exists;
1333       array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists);
1334       if (i >= self->step_offsets.size) i = self->step_offsets.size - 1;
1335       *error_offset = self->step_offsets.contents[i].byte_offset;
1336       all_patterns_are_valid = false;
1337       break;
1338     }
1339   }
1340 
1341   // Mark as indefinite any step with captures that are used in predicates.
1342   Array(uint16_t) predicate_capture_ids = array_new();
1343   for (unsigned i = 0; i < self->patterns.size; i++) {
1344     QueryPattern *pattern = &self->patterns.contents[i];
1345 
1346     // Gather all of the captures that are used in predicates for this pattern.
1347     array_clear(&predicate_capture_ids);
1348     for (
1349       unsigned start = pattern->predicate_steps.offset,
1350       end = start + pattern->predicate_steps.length,
1351       j = start; j < end; j++
1352     ) {
1353       TSQueryPredicateStep *step = &self->predicate_steps.contents[j];
1354       if (step->type == TSQueryPredicateStepTypeCapture) {
1355         array_insert_sorted_by(&predicate_capture_ids, , step->value_id);
1356       }
1357     }
1358 
1359     // Find all of the steps that have these captures.
1360     for (
1361       unsigned start = pattern->steps.offset,
1362       end = start + pattern->steps.length,
1363       j = start; j < end; j++
1364     ) {
1365       QueryStep *step = &self->steps.contents[j];
1366       for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) {
1367         uint16_t capture_id = step->capture_ids[k];
1368         if (capture_id == NONE) break;
1369         unsigned index, exists;
1370         array_search_sorted_by(&predicate_capture_ids, , capture_id, &index, &exists);
1371         if (exists) {
1372           step->root_pattern_guaranteed = false;
1373           break;
1374         }
1375       }
1376     }
1377   }
1378 
1379   // Propagate fallibility. If a pattern is fallible at a given step, then it is
1380   // fallible at all of its preceding steps.
1381   bool done = self->steps.size == 0;
1382   while (!done) {
1383     done = true;
1384     for (unsigned i = self->steps.size - 1; i > 0; i--) {
1385       QueryStep *step = &self->steps.contents[i];
1386       if (step->depth == PATTERN_DONE_MARKER) continue;
1387 
1388       // Determine if this step is definite or has definite alternatives.
1389       bool parent_pattern_guaranteed = false;
1390       for (;;) {
1391         if (step->root_pattern_guaranteed) {
1392           parent_pattern_guaranteed = true;
1393           break;
1394         }
1395         if (step->alternative_index == NONE || step->alternative_index < i) {
1396           break;
1397         }
1398         step = &self->steps.contents[step->alternative_index];
1399       }
1400 
1401       // If not, mark its predecessor as indefinite.
1402       if (!parent_pattern_guaranteed) {
1403         QueryStep *prev_step = &self->steps.contents[i - 1];
1404         if (
1405           !prev_step->is_dead_end &&
1406           prev_step->depth != PATTERN_DONE_MARKER &&
1407           prev_step->root_pattern_guaranteed
1408         ) {
1409           prev_step->root_pattern_guaranteed = false;
1410           done = false;
1411         }
1412       }
1413     }
1414   }
1415 
1416   #ifdef DEBUG_ANALYZE_QUERY
1417     printf("Steps:\n");
1418     for (unsigned i = 0; i < self->steps.size; i++) {
1419       QueryStep *step = &self->steps.contents[i];
1420       if (step->depth == PATTERN_DONE_MARKER) {
1421         printf("  %u: DONE\n", i);
1422       } else {
1423         printf(
1424           "  %u: {symbol: %s, field: %s, depth: %u, parent_pattern_guaranteed: %d, root_pattern_guaranteed: %d}\n",
1425           i,
1426           (step->symbol == WILDCARD_SYMBOL)
1427             ? "ANY"
1428             : ts_language_symbol_name(self->language, step->symbol),
1429           (step->field ? ts_language_field_name_for_id(self->language, step->field) : "-"),
1430           step->depth,
1431           step->parent_pattern_guaranteed,
1432           step->root_pattern_guaranteed
1433         );
1434       }
1435     }
1436   #endif
1437 
1438   // Cleanup
1439   for (unsigned i = 0; i < subgraphs.size; i++) {
1440     array_delete(&subgraphs.contents[i].start_states);
1441     array_delete(&subgraphs.contents[i].nodes);
1442   }
1443   array_delete(&subgraphs);
1444   array_delete(&next_nodes);
1445   array_delete(&states);
1446   array_delete(&next_states);
1447   array_delete(&deeper_states);
1448   array_delete(&final_step_indices);
1449   array_delete(&parent_step_indices);
1450   array_delete(&predicate_capture_ids);
1451   state_predecessor_map_delete(&predecessor_map);
1452 
1453   return all_patterns_are_valid;
1454 }
1455 
ts_query__add_negated_fields(TSQuery * self,uint16_t step_index,TSFieldId * field_ids,uint16_t field_count)1456 static void ts_query__add_negated_fields(
1457   TSQuery *self,
1458   uint16_t step_index,
1459   TSFieldId *field_ids,
1460   uint16_t field_count
1461 ) {
1462   QueryStep *step = &self->steps.contents[step_index];
1463 
1464   // The negated field array stores a list of field lists, separated by zeros.
1465   // Try to find the start index of an existing list that matches this new list.
1466   bool failed_match = false;
1467   unsigned match_count = 0;
1468   unsigned start_i = 0;
1469   for (unsigned i = 0; i < self->negated_fields.size; i++) {
1470     TSFieldId existing_field_id = self->negated_fields.contents[i];
1471 
1472     // At each zero value, terminate the match attempt. If we've exactly
1473     // matched the new field list, then reuse this index. Otherwise,
1474     // start over the matching process.
1475     if (existing_field_id == 0) {
1476       if (match_count == field_count) {
1477         step->negated_field_list_id = start_i;
1478         return;
1479       } else {
1480         start_i = i + 1;
1481         match_count = 0;
1482         failed_match = false;
1483       }
1484     }
1485 
1486     // If the existing list matches our new list so far, then advance
1487     // to the next element of the new list.
1488     else if (
1489       match_count < field_count &&
1490       existing_field_id == field_ids[match_count] &&
1491       !failed_match
1492     ) {
1493       match_count++;
1494     }
1495 
1496     // Otherwise, this existing list has failed to match.
1497     else {
1498       match_count = 0;
1499       failed_match = true;
1500     }
1501   }
1502 
1503   step->negated_field_list_id = self->negated_fields.size;
1504   array_extend(&self->negated_fields, field_count, field_ids);
1505   array_push(&self->negated_fields, 0);
1506 }
1507 
ts_query__parse_string_literal(TSQuery * self,Stream * stream)1508 static TSQueryError ts_query__parse_string_literal(
1509   TSQuery *self,
1510   Stream *stream
1511 ) {
1512   const char *string_start = stream->input;
1513   if (stream->next != '"') return TSQueryErrorSyntax;
1514   stream_advance(stream);
1515   const char *prev_position = stream->input;
1516 
1517   bool is_escaped = false;
1518   array_clear(&self->string_buffer);
1519   for (;;) {
1520     if (is_escaped) {
1521       is_escaped = false;
1522       switch (stream->next) {
1523         case 'n':
1524           array_push(&self->string_buffer, '\n');
1525           break;
1526         case 'r':
1527           array_push(&self->string_buffer, '\r');
1528           break;
1529         case 't':
1530           array_push(&self->string_buffer, '\t');
1531           break;
1532         case '0':
1533           array_push(&self->string_buffer, '\0');
1534           break;
1535         default:
1536           array_extend(&self->string_buffer, stream->next_size, stream->input);
1537           break;
1538       }
1539       prev_position = stream->input + stream->next_size;
1540     } else {
1541       if (stream->next == '\\') {
1542         array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
1543         prev_position = stream->input + 1;
1544         is_escaped = true;
1545       } else if (stream->next == '"') {
1546         array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
1547         stream_advance(stream);
1548         return TSQueryErrorNone;
1549       } else if (stream->next == '\n') {
1550         stream_reset(stream, string_start);
1551         return TSQueryErrorSyntax;
1552       }
1553     }
1554     if (!stream_advance(stream)) {
1555       stream_reset(stream, string_start);
1556       return TSQueryErrorSyntax;
1557     }
1558   }
1559 }
1560 
1561 // Parse a single predicate associated with a pattern, adding it to the
1562 // query's internal `predicate_steps` array. Predicates are arbitrary
1563 // S-expressions associated with a pattern which are meant to be handled at
1564 // a higher level of abstraction, such as the Rust/JavaScript bindings. They
1565 // can contain '@'-prefixed capture names, double-quoted strings, and bare
1566 // symbols, which also represent strings.
ts_query__parse_predicate(TSQuery * self,Stream * stream)1567 static TSQueryError ts_query__parse_predicate(
1568   TSQuery *self,
1569   Stream *stream
1570 ) {
1571   if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1572   const char *predicate_name = stream->input;
1573   stream_scan_identifier(stream);
1574   uint32_t length = stream->input - predicate_name;
1575   uint16_t id = symbol_table_insert_name(
1576     &self->predicate_values,
1577     predicate_name,
1578     length
1579   );
1580   array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1581     .type = TSQueryPredicateStepTypeString,
1582     .value_id = id,
1583   }));
1584   stream_skip_whitespace(stream);
1585 
1586   for (;;) {
1587     if (stream->next == ')') {
1588       stream_advance(stream);
1589       stream_skip_whitespace(stream);
1590       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1591         .type = TSQueryPredicateStepTypeDone,
1592         .value_id = 0,
1593       }));
1594       break;
1595     }
1596 
1597     // Parse an '@'-prefixed capture name
1598     else if (stream->next == '@') {
1599       stream_advance(stream);
1600 
1601       // Parse the capture name
1602       if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1603       const char *capture_name = stream->input;
1604       stream_scan_identifier(stream);
1605       uint32_t length = stream->input - capture_name;
1606 
1607       // Add the capture id to the first step of the pattern
1608       int capture_id = symbol_table_id_for_name(
1609         &self->captures,
1610         capture_name,
1611         length
1612       );
1613       if (capture_id == -1) {
1614         stream_reset(stream, capture_name);
1615         return TSQueryErrorCapture;
1616       }
1617 
1618       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1619         .type = TSQueryPredicateStepTypeCapture,
1620         .value_id = capture_id,
1621       }));
1622     }
1623 
1624     // Parse a string literal
1625     else if (stream->next == '"') {
1626       TSQueryError e = ts_query__parse_string_literal(self, stream);
1627       if (e) return e;
1628       uint16_t id = symbol_table_insert_name(
1629         &self->predicate_values,
1630         self->string_buffer.contents,
1631         self->string_buffer.size
1632       );
1633       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1634         .type = TSQueryPredicateStepTypeString,
1635         .value_id = id,
1636       }));
1637     }
1638 
1639     // Parse a bare symbol
1640     else if (stream_is_ident_start(stream)) {
1641       const char *symbol_start = stream->input;
1642       stream_scan_identifier(stream);
1643       uint32_t length = stream->input - symbol_start;
1644       uint16_t id = symbol_table_insert_name(
1645         &self->predicate_values,
1646         symbol_start,
1647         length
1648       );
1649       array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1650         .type = TSQueryPredicateStepTypeString,
1651         .value_id = id,
1652       }));
1653     }
1654 
1655     else {
1656       return TSQueryErrorSyntax;
1657     }
1658 
1659     stream_skip_whitespace(stream);
1660   }
1661 
1662   return 0;
1663 }
1664 
1665 // Read one S-expression pattern from the stream, and incorporate it into
1666 // the query's internal state machine representation. For nested patterns,
1667 // this function calls itself recursively.
ts_query__parse_pattern(TSQuery * self,Stream * stream,uint32_t depth,bool is_immediate)1668 static TSQueryError ts_query__parse_pattern(
1669   TSQuery *self,
1670   Stream *stream,
1671   uint32_t depth,
1672   bool is_immediate
1673 ) {
1674   if (stream->next == 0) return TSQueryErrorSyntax;
1675   if (stream->next == ')' || stream->next == ']') return PARENT_DONE;
1676 
1677   const uint32_t starting_step_index = self->steps.size;
1678 
1679   // Store the byte offset of each step in the query.
1680   if (
1681     self->step_offsets.size == 0 ||
1682     array_back(&self->step_offsets)->step_index != starting_step_index
1683   ) {
1684     array_push(&self->step_offsets, ((StepOffset) {
1685       .step_index = starting_step_index,
1686       .byte_offset = stream_offset(stream),
1687     }));
1688   }
1689 
1690   // An open bracket is the start of an alternation.
1691   if (stream->next == '[') {
1692     stream_advance(stream);
1693     stream_skip_whitespace(stream);
1694 
1695     // Parse each branch, and add a placeholder step in between the branches.
1696     Array(uint32_t) branch_step_indices = array_new();
1697     for (;;) {
1698       uint32_t start_index = self->steps.size;
1699       TSQueryError e = ts_query__parse_pattern(
1700         self,
1701         stream,
1702         depth,
1703         is_immediate
1704       );
1705 
1706       if (e == PARENT_DONE) {
1707         if (stream->next == ']' && branch_step_indices.size > 0) {
1708           stream_advance(stream);
1709           break;
1710         }
1711         e = TSQueryErrorSyntax;
1712       }
1713       if (e) {
1714         array_delete(&branch_step_indices);
1715         return e;
1716       }
1717 
1718       array_push(&branch_step_indices, start_index);
1719       array_push(&self->steps, query_step__new(0, depth, false));
1720     }
1721     (void)array_pop(&self->steps);
1722 
1723     // For all of the branches except for the last one, add the subsequent branch as an
1724     // alternative, and link the end of the branch to the current end of the steps.
1725     for (unsigned i = 0; i < branch_step_indices.size - 1; i++) {
1726       uint32_t step_index = branch_step_indices.contents[i];
1727       uint32_t next_step_index = branch_step_indices.contents[i + 1];
1728       QueryStep *start_step = &self->steps.contents[step_index];
1729       QueryStep *end_step = &self->steps.contents[next_step_index - 1];
1730       start_step->alternative_index = next_step_index;
1731       end_step->alternative_index = self->steps.size;
1732       end_step->is_dead_end = true;
1733     }
1734 
1735     array_delete(&branch_step_indices);
1736   }
1737 
1738   // An open parenthesis can be the start of three possible constructs:
1739   // * A grouped sequence
1740   // * A predicate
1741   // * A named node
1742   else if (stream->next == '(') {
1743     stream_advance(stream);
1744     stream_skip_whitespace(stream);
1745 
1746     // If this parenthesis is followed by a node, then it represents a grouped sequence.
1747     if (stream->next == '(' || stream->next == '"' || stream->next == '[') {
1748       bool child_is_immediate = false;
1749       for (;;) {
1750         if (stream->next == '.') {
1751           child_is_immediate = true;
1752           stream_advance(stream);
1753           stream_skip_whitespace(stream);
1754         }
1755         TSQueryError e = ts_query__parse_pattern(
1756           self,
1757           stream,
1758           depth,
1759           child_is_immediate
1760         );
1761         if (e == PARENT_DONE) {
1762           if (stream->next == ')') {
1763             stream_advance(stream);
1764             break;
1765           }
1766           e = TSQueryErrorSyntax;
1767         }
1768         if (e) return e;
1769 
1770         child_is_immediate = false;
1771       }
1772     }
1773 
1774     // A dot/pound character indicates the start of a predicate.
1775     else if (stream->next == '.' || stream->next == '#') {
1776       stream_advance(stream);
1777       return ts_query__parse_predicate(self, stream);
1778     }
1779 
1780     // Otherwise, this parenthesis is the start of a named node.
1781     else {
1782       TSSymbol symbol;
1783 
1784       // Parse a normal node name
1785       if (stream_is_ident_start(stream)) {
1786         const char *node_name = stream->input;
1787         stream_scan_identifier(stream);
1788         uint32_t length = stream->input - node_name;
1789 
1790         // TODO - remove.
1791         // For temporary backward compatibility, handle predicates without the leading '#' sign.
1792         if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) {
1793           stream_reset(stream, node_name);
1794           return ts_query__parse_predicate(self, stream);
1795         }
1796 
1797         // Parse the wildcard symbol
1798         else if (length == 1 && node_name[0] == '_') {
1799           symbol = WILDCARD_SYMBOL;
1800         }
1801 
1802         else {
1803           symbol = ts_language_symbol_for_name(
1804             self->language,
1805             node_name,
1806             length,
1807             true
1808           );
1809           if (!symbol) {
1810             stream_reset(stream, node_name);
1811             return TSQueryErrorNodeType;
1812           }
1813         }
1814       } else {
1815         return TSQueryErrorSyntax;
1816       }
1817 
1818       // Add a step for the node.
1819       array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1820       QueryStep *step = array_back(&self->steps);
1821       if (ts_language_symbol_metadata(self->language, symbol).supertype) {
1822         step->supertype_symbol = step->symbol;
1823         step->symbol = WILDCARD_SYMBOL;
1824       }
1825       if (symbol == WILDCARD_SYMBOL) {
1826         step->is_named = true;
1827       }
1828 
1829       stream_skip_whitespace(stream);
1830 
1831       if (stream->next == '/') {
1832         stream_advance(stream);
1833         if (!stream_is_ident_start(stream)) {
1834           return TSQueryErrorSyntax;
1835         }
1836 
1837         const char *node_name = stream->input;
1838         stream_scan_identifier(stream);
1839         uint32_t length = stream->input - node_name;
1840 
1841         step->symbol = ts_language_symbol_for_name(
1842           self->language,
1843           node_name,
1844           length,
1845           true
1846         );
1847         if (!step->symbol) {
1848           stream_reset(stream, node_name);
1849           return TSQueryErrorNodeType;
1850         }
1851 
1852         stream_skip_whitespace(stream);
1853       }
1854 
1855       // Parse the child patterns
1856       bool child_is_immediate = false;
1857       uint16_t last_child_step_index = 0;
1858       uint16_t negated_field_count = 0;
1859       TSFieldId negated_field_ids[MAX_NEGATED_FIELD_COUNT];
1860       for (;;) {
1861         // Parse a negated field assertion
1862         if (stream->next == '!') {
1863           stream_advance(stream);
1864           stream_skip_whitespace(stream);
1865           if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1866           const char *field_name = stream->input;
1867           stream_scan_identifier(stream);
1868           uint32_t length = stream->input - field_name;
1869           stream_skip_whitespace(stream);
1870 
1871           TSFieldId field_id = ts_language_field_id_for_name(
1872             self->language,
1873             field_name,
1874             length
1875           );
1876           if (!field_id) {
1877             stream->input = field_name;
1878             return TSQueryErrorField;
1879           }
1880 
1881           // Keep the field ids sorted.
1882           if (negated_field_count < MAX_NEGATED_FIELD_COUNT) {
1883             negated_field_ids[negated_field_count] = field_id;
1884             negated_field_count++;
1885           }
1886 
1887           continue;
1888         }
1889 
1890         // Parse a sibling anchor
1891         if (stream->next == '.') {
1892           child_is_immediate = true;
1893           stream_advance(stream);
1894           stream_skip_whitespace(stream);
1895         }
1896 
1897         uint16_t step_index = self->steps.size;
1898         TSQueryError e = ts_query__parse_pattern(
1899           self,
1900           stream,
1901           depth + 1,
1902           child_is_immediate
1903         );
1904         if (e == PARENT_DONE) {
1905           if (stream->next == ')') {
1906             if (child_is_immediate) {
1907               if (last_child_step_index == 0) return TSQueryErrorSyntax;
1908               self->steps.contents[last_child_step_index].is_last_child = true;
1909             }
1910 
1911             if (negated_field_count) {
1912               ts_query__add_negated_fields(
1913                 self,
1914                 starting_step_index,
1915                 negated_field_ids,
1916                 negated_field_count
1917               );
1918             }
1919 
1920             stream_advance(stream);
1921             break;
1922           }
1923           e = TSQueryErrorSyntax;
1924         }
1925         if (e) return e;
1926 
1927         last_child_step_index = step_index;
1928         child_is_immediate = false;
1929       }
1930     }
1931   }
1932 
1933   // Parse a wildcard pattern
1934   else if (stream->next == '_') {
1935     stream_advance(stream);
1936     stream_skip_whitespace(stream);
1937 
1938     // Add a step that matches any kind of node
1939     array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate));
1940   }
1941 
1942   // Parse a double-quoted anonymous leaf node expression
1943   else if (stream->next == '"') {
1944     const char *string_start = stream->input;
1945     TSQueryError e = ts_query__parse_string_literal(self, stream);
1946     if (e) return e;
1947 
1948     // Add a step for the node
1949     TSSymbol symbol = ts_language_symbol_for_name(
1950       self->language,
1951       self->string_buffer.contents,
1952       self->string_buffer.size,
1953       false
1954     );
1955     if (!symbol) {
1956       stream_reset(stream, string_start + 1);
1957       return TSQueryErrorNodeType;
1958     }
1959     array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1960   }
1961 
1962   // Parse a field-prefixed pattern
1963   else if (stream_is_ident_start(stream)) {
1964     // Parse the field name
1965     const char *field_name = stream->input;
1966     stream_scan_identifier(stream);
1967     uint32_t length = stream->input - field_name;
1968     stream_skip_whitespace(stream);
1969 
1970     if (stream->next != ':') {
1971       stream_reset(stream, field_name);
1972       return TSQueryErrorSyntax;
1973     }
1974     stream_advance(stream);
1975     stream_skip_whitespace(stream);
1976 
1977     // Parse the pattern
1978     TSQueryError e = ts_query__parse_pattern(
1979       self,
1980       stream,
1981       depth,
1982       is_immediate
1983     );
1984     if (e == PARENT_DONE) return TSQueryErrorSyntax;
1985     if (e) return e;
1986 
1987     // Add the field name to the first step of the pattern
1988     TSFieldId field_id = ts_language_field_id_for_name(
1989       self->language,
1990       field_name,
1991       length
1992     );
1993     if (!field_id) {
1994       stream->input = field_name;
1995       return TSQueryErrorField;
1996     }
1997 
1998     uint32_t step_index = starting_step_index;
1999     QueryStep *step = &self->steps.contents[step_index];
2000     for (;;) {
2001       step->field = field_id;
2002       if (
2003         step->alternative_index != NONE &&
2004         step->alternative_index > step_index &&
2005         step->alternative_index < self->steps.size
2006       ) {
2007         step_index = step->alternative_index;
2008         step = &self->steps.contents[step_index];
2009       } else {
2010         break;
2011       }
2012     }
2013   }
2014 
2015   else {
2016     return TSQueryErrorSyntax;
2017   }
2018 
2019   stream_skip_whitespace(stream);
2020 
2021   // Parse suffixes modifiers for this pattern
2022   for (;;) {
2023     // Parse the one-or-more operator.
2024     if (stream->next == '+') {
2025       stream_advance(stream);
2026       stream_skip_whitespace(stream);
2027 
2028       QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false);
2029       repeat_step.alternative_index = starting_step_index;
2030       repeat_step.is_pass_through = true;
2031       repeat_step.alternative_is_immediate = true;
2032       array_push(&self->steps, repeat_step);
2033     }
2034 
2035     // Parse the zero-or-more repetition operator.
2036     else if (stream->next == '*') {
2037       stream_advance(stream);
2038       stream_skip_whitespace(stream);
2039 
2040       QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false);
2041       repeat_step.alternative_index = starting_step_index;
2042       repeat_step.is_pass_through = true;
2043       repeat_step.alternative_is_immediate = true;
2044       array_push(&self->steps, repeat_step);
2045 
2046       QueryStep *step = &self->steps.contents[starting_step_index];
2047       while (step->alternative_index != NONE) {
2048         step = &self->steps.contents[step->alternative_index];
2049       }
2050       step->alternative_index = self->steps.size;
2051     }
2052 
2053     // Parse the optional operator.
2054     else if (stream->next == '?') {
2055       stream_advance(stream);
2056       stream_skip_whitespace(stream);
2057 
2058       QueryStep *step = &self->steps.contents[starting_step_index];
2059       while (step->alternative_index != NONE) {
2060         step = &self->steps.contents[step->alternative_index];
2061       }
2062       step->alternative_index = self->steps.size;
2063     }
2064 
2065     // Parse an '@'-prefixed capture pattern
2066     else if (stream->next == '@') {
2067       stream_advance(stream);
2068       if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
2069       const char *capture_name = stream->input;
2070       stream_scan_identifier(stream);
2071       uint32_t length = stream->input - capture_name;
2072       stream_skip_whitespace(stream);
2073 
2074       // Add the capture id to the first step of the pattern
2075       uint16_t capture_id = symbol_table_insert_name(
2076         &self->captures,
2077         capture_name,
2078         length
2079       );
2080 
2081       uint32_t step_index = starting_step_index;
2082       for (;;) {
2083         QueryStep *step = &self->steps.contents[step_index];
2084         query_step__add_capture(step, capture_id);
2085         if (
2086           step->alternative_index != NONE &&
2087           step->alternative_index > step_index &&
2088           step->alternative_index < self->steps.size
2089         ) {
2090           step_index = step->alternative_index;
2091           step = &self->steps.contents[step_index];
2092         } else {
2093           break;
2094         }
2095       }
2096     }
2097 
2098     // No more suffix modifiers
2099     else {
2100       break;
2101     }
2102   }
2103 
2104   return 0;
2105 }
2106 
ts_query_new(const TSLanguage * language,const char * source,uint32_t source_len,uint32_t * error_offset,TSQueryError * error_type)2107 TSQuery *ts_query_new(
2108   const TSLanguage *language,
2109   const char *source,
2110   uint32_t source_len,
2111   uint32_t *error_offset,
2112   TSQueryError *error_type
2113 ) {
2114   if (
2115     !language ||
2116     language->version > TREE_SITTER_LANGUAGE_VERSION ||
2117     language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION
2118   ) {
2119     *error_type = TSQueryErrorLanguage;
2120     return NULL;
2121   }
2122 
2123   TSQuery *self = ts_malloc(sizeof(TSQuery));
2124   *self = (TSQuery) {
2125     .steps = array_new(),
2126     .pattern_map = array_new(),
2127     .captures = symbol_table_new(),
2128     .predicate_values = symbol_table_new(),
2129     .predicate_steps = array_new(),
2130     .patterns = array_new(),
2131     .step_offsets = array_new(),
2132     .string_buffer = array_new(),
2133     .negated_fields = array_new(),
2134     .wildcard_root_pattern_count = 0,
2135     .language = language,
2136   };
2137 
2138   array_push(&self->negated_fields, 0);
2139 
2140   // Parse all of the S-expressions in the given string.
2141   Stream stream = stream_new(source, source_len);
2142   stream_skip_whitespace(&stream);
2143   while (stream.input < stream.end) {
2144     uint32_t pattern_index = self->patterns.size;
2145     uint32_t start_step_index = self->steps.size;
2146     uint32_t start_predicate_step_index = self->predicate_steps.size;
2147     array_push(&self->patterns, ((QueryPattern) {
2148       .steps = (Slice) {.offset = start_step_index},
2149       .predicate_steps = (Slice) {.offset = start_predicate_step_index},
2150       .start_byte = stream_offset(&stream),
2151     }));
2152     *error_type = ts_query__parse_pattern(self, &stream, 0, false);
2153     array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false));
2154 
2155     QueryPattern *pattern = array_back(&self->patterns);
2156     pattern->steps.length = self->steps.size - start_step_index;
2157     pattern->predicate_steps.length = self->predicate_steps.size - start_predicate_step_index;
2158 
2159     // If any pattern could not be parsed, then report the error information
2160     // and terminate.
2161     if (*error_type) {
2162       if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax;
2163       *error_offset = stream_offset(&stream);
2164       ts_query_delete(self);
2165       return NULL;
2166     }
2167 
2168     // Maintain a map that can look up patterns for a given root symbol.
2169     uint16_t wildcard_root_alternative_index = NONE;
2170     for (;;) {
2171       QueryStep *step = &self->steps.contents[start_step_index];
2172 
2173       // If a pattern has a wildcard at its root, but it has a non-wildcard child,
2174       // then optimize the matching process by skipping matching the wildcard.
2175       // Later, during the matching process, the query cursor will check that
2176       // there is a parent node, and capture it if necessary.
2177       if (step->symbol == WILDCARD_SYMBOL && step->depth == 0 && !step->field) {
2178         QueryStep *second_step = &self->steps.contents[start_step_index + 1];
2179         if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1) {
2180           wildcard_root_alternative_index = step->alternative_index;
2181           start_step_index += 1;
2182           step = second_step;
2183         }
2184       }
2185 
2186       // Determine whether the pattern has a single root node. This affects
2187       // decisions about whether or not to start matching the pattern when
2188       // a query cursor has a range restriction.
2189       bool is_rooted = true;
2190       uint32_t start_depth = step->depth;
2191       for (uint32_t step_index = start_step_index + 1; step_index < self->steps.size; step_index++) {
2192         QueryStep *step = &self->steps.contents[step_index];
2193         if (step->depth == start_depth) {
2194           is_rooted = false;
2195           break;
2196         }
2197       }
2198 
2199       ts_query__pattern_map_insert(self, step->symbol, (PatternEntry) {
2200         .step_index = start_step_index,
2201         .pattern_index = pattern_index,
2202         .is_rooted = is_rooted
2203       });
2204       if (step->symbol == WILDCARD_SYMBOL) {
2205         self->wildcard_root_pattern_count++;
2206       }
2207 
2208       // If there are alternatives or options at the root of the pattern,
2209       // then add multiple entries to the pattern map.
2210       if (step->alternative_index != NONE) {
2211         start_step_index = step->alternative_index;
2212         step->alternative_index = NONE;
2213       } else if (wildcard_root_alternative_index != NONE) {
2214         start_step_index = wildcard_root_alternative_index;
2215         wildcard_root_alternative_index = NONE;
2216       } else {
2217         break;
2218       }
2219     }
2220   }
2221 
2222   if (!ts_query__analyze_patterns(self, error_offset)) {
2223     *error_type = TSQueryErrorStructure;
2224     ts_query_delete(self);
2225     return NULL;
2226   }
2227 
2228   array_delete(&self->string_buffer);
2229   return self;
2230 }
2231 
ts_query_delete(TSQuery * self)2232 void ts_query_delete(TSQuery *self) {
2233   if (self) {
2234     array_delete(&self->steps);
2235     array_delete(&self->pattern_map);
2236     array_delete(&self->predicate_steps);
2237     array_delete(&self->patterns);
2238     array_delete(&self->step_offsets);
2239     array_delete(&self->string_buffer);
2240     array_delete(&self->negated_fields);
2241     symbol_table_delete(&self->captures);
2242     symbol_table_delete(&self->predicate_values);
2243     ts_free(self);
2244   }
2245 }
2246 
ts_query_pattern_count(const TSQuery * self)2247 uint32_t ts_query_pattern_count(const TSQuery *self) {
2248   return self->patterns.size;
2249 }
2250 
ts_query_capture_count(const TSQuery * self)2251 uint32_t ts_query_capture_count(const TSQuery *self) {
2252   return self->captures.slices.size;
2253 }
2254 
ts_query_string_count(const TSQuery * self)2255 uint32_t ts_query_string_count(const TSQuery *self) {
2256   return self->predicate_values.slices.size;
2257 }
2258 
ts_query_capture_name_for_id(const TSQuery * self,uint32_t index,uint32_t * length)2259 const char *ts_query_capture_name_for_id(
2260   const TSQuery *self,
2261   uint32_t index,
2262   uint32_t *length
2263 ) {
2264   return symbol_table_name_for_id(&self->captures, index, length);
2265 }
2266 
ts_query_string_value_for_id(const TSQuery * self,uint32_t index,uint32_t * length)2267 const char *ts_query_string_value_for_id(
2268   const TSQuery *self,
2269   uint32_t index,
2270   uint32_t *length
2271 ) {
2272   return symbol_table_name_for_id(&self->predicate_values, index, length);
2273 }
2274 
ts_query_predicates_for_pattern(const TSQuery * self,uint32_t pattern_index,uint32_t * step_count)2275 const TSQueryPredicateStep *ts_query_predicates_for_pattern(
2276   const TSQuery *self,
2277   uint32_t pattern_index,
2278   uint32_t *step_count
2279 ) {
2280   Slice slice = self->patterns.contents[pattern_index].predicate_steps;
2281   *step_count = slice.length;
2282   if (self->predicate_steps.contents == NULL) {
2283     return NULL;
2284   }
2285   return &self->predicate_steps.contents[slice.offset];
2286 }
2287 
ts_query_start_byte_for_pattern(const TSQuery * self,uint32_t pattern_index)2288 uint32_t ts_query_start_byte_for_pattern(
2289   const TSQuery *self,
2290   uint32_t pattern_index
2291 ) {
2292   return self->patterns.contents[pattern_index].start_byte;
2293 }
2294 
ts_query_is_pattern_guaranteed_at_step(const TSQuery * self,uint32_t byte_offset)2295 bool ts_query_is_pattern_guaranteed_at_step(
2296   const TSQuery *self,
2297   uint32_t byte_offset
2298 ) {
2299   uint32_t step_index = UINT32_MAX;
2300   for (unsigned i = 0; i < self->step_offsets.size; i++) {
2301     StepOffset *step_offset = &self->step_offsets.contents[i];
2302     if (step_offset->byte_offset > byte_offset) break;
2303     step_index = step_offset->step_index;
2304   }
2305   if (step_index < self->steps.size) {
2306     return self->steps.contents[step_index].root_pattern_guaranteed;
2307   } else {
2308     return false;
2309   }
2310 }
2311 
ts_query__step_is_fallible(const TSQuery * self,uint16_t step_index)2312 bool ts_query__step_is_fallible(
2313   const TSQuery *self,
2314   uint16_t step_index
2315 ) {
2316   assert((uint32_t)step_index + 1 < self->steps.size);
2317   QueryStep *step = &self->steps.contents[step_index];
2318   QueryStep *next_step = &self->steps.contents[step_index + 1];
2319   return (
2320     next_step->depth != PATTERN_DONE_MARKER &&
2321     next_step->depth > step->depth &&
2322     !next_step->parent_pattern_guaranteed
2323   );
2324 }
2325 
ts_query_disable_capture(TSQuery * self,const char * name,uint32_t length)2326 void ts_query_disable_capture(
2327   TSQuery *self,
2328   const char *name,
2329   uint32_t length
2330 ) {
2331   // Remove capture information for any pattern step that previously
2332   // captured with the given name.
2333   int id = symbol_table_id_for_name(&self->captures, name, length);
2334   if (id != -1) {
2335     for (unsigned i = 0; i < self->steps.size; i++) {
2336       QueryStep *step = &self->steps.contents[i];
2337       query_step__remove_capture(step, id);
2338     }
2339   }
2340 }
2341 
ts_query_disable_pattern(TSQuery * self,uint32_t pattern_index)2342 void ts_query_disable_pattern(
2343   TSQuery *self,
2344   uint32_t pattern_index
2345 ) {
2346   // Remove the given pattern from the pattern map. Its steps will still
2347   // be in the `steps` array, but they will never be read.
2348   for (unsigned i = 0; i < self->pattern_map.size; i++) {
2349     PatternEntry *pattern = &self->pattern_map.contents[i];
2350     if (pattern->pattern_index == pattern_index) {
2351       array_erase(&self->pattern_map, i);
2352       i--;
2353     }
2354   }
2355 }
2356 
2357 /***************
2358  * QueryCursor
2359  ***************/
2360 
ts_query_cursor_new(void)2361 TSQueryCursor *ts_query_cursor_new(void) {
2362   TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
2363   *self = (TSQueryCursor) {
2364     .did_exceed_match_limit = false,
2365     .ascending = false,
2366     .halted = false,
2367     .states = array_new(),
2368     .finished_states = array_new(),
2369     .capture_list_pool = capture_list_pool_new(),
2370     .start_byte = 0,
2371     .end_byte = UINT32_MAX,
2372     .start_point = {0, 0},
2373     .end_point = POINT_MAX,
2374   };
2375   array_reserve(&self->states, 8);
2376   array_reserve(&self->finished_states, 8);
2377   return self;
2378 }
2379 
ts_query_cursor_delete(TSQueryCursor * self)2380 void ts_query_cursor_delete(TSQueryCursor *self) {
2381   array_delete(&self->states);
2382   array_delete(&self->finished_states);
2383   ts_tree_cursor_delete(&self->cursor);
2384   capture_list_pool_delete(&self->capture_list_pool);
2385   ts_free(self);
2386 }
2387 
ts_query_cursor_did_exceed_match_limit(const TSQueryCursor * self)2388 bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
2389   return self->did_exceed_match_limit;
2390 }
2391 
ts_query_cursor_match_limit(const TSQueryCursor * self)2392 uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self) {
2393   return self->capture_list_pool.max_capture_list_count;
2394 }
2395 
ts_query_cursor_set_match_limit(TSQueryCursor * self,uint32_t limit)2396 void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) {
2397   self->capture_list_pool.max_capture_list_count = limit;
2398 }
2399 
ts_query_cursor_exec(TSQueryCursor * self,const TSQuery * query,TSNode node)2400 void ts_query_cursor_exec(
2401   TSQueryCursor *self,
2402   const TSQuery *query,
2403   TSNode node
2404 ) {
2405   array_clear(&self->states);
2406   array_clear(&self->finished_states);
2407   ts_tree_cursor_reset(&self->cursor, node);
2408   capture_list_pool_reset(&self->capture_list_pool);
2409   self->next_state_id = 0;
2410   self->depth = 0;
2411   self->ascending = false;
2412   self->halted = false;
2413   self->query = query;
2414   self->did_exceed_match_limit = false;
2415 }
2416 
ts_query_cursor_set_byte_range(TSQueryCursor * self,uint32_t start_byte,uint32_t end_byte)2417 void ts_query_cursor_set_byte_range(
2418   TSQueryCursor *self,
2419   uint32_t start_byte,
2420   uint32_t end_byte
2421 ) {
2422   if (end_byte == 0) {
2423     end_byte = UINT32_MAX;
2424   }
2425   self->start_byte = start_byte;
2426   self->end_byte = end_byte;
2427 }
2428 
ts_query_cursor_set_point_range(TSQueryCursor * self,TSPoint start_point,TSPoint end_point)2429 void ts_query_cursor_set_point_range(
2430   TSQueryCursor *self,
2431   TSPoint start_point,
2432   TSPoint end_point
2433 ) {
2434   if (end_point.row == 0 && end_point.column == 0) {
2435     end_point = POINT_MAX;
2436   }
2437   self->start_point = start_point;
2438   self->end_point = end_point;
2439 }
2440 
2441 // Search through all of the in-progress states, and find the captured
2442 // node that occurs earliest in the document.
ts_query_cursor__first_in_progress_capture(TSQueryCursor * self,uint32_t * state_index,uint32_t * byte_offset,uint32_t * pattern_index,bool * root_pattern_guaranteed)2443 static bool ts_query_cursor__first_in_progress_capture(
2444   TSQueryCursor *self,
2445   uint32_t *state_index,
2446   uint32_t *byte_offset,
2447   uint32_t *pattern_index,
2448   bool *root_pattern_guaranteed
2449 ) {
2450   bool result = false;
2451   *state_index = UINT32_MAX;
2452   *byte_offset = UINT32_MAX;
2453   *pattern_index = UINT32_MAX;
2454   for (unsigned i = 0; i < self->states.size; i++) {
2455     QueryState *state = &self->states.contents[i];
2456     if (state->dead) continue;
2457 
2458     const CaptureList *captures = capture_list_pool_get(
2459       &self->capture_list_pool,
2460       state->capture_list_id
2461     );
2462     if (state->consumed_capture_count >= captures->size) {
2463       continue;
2464     }
2465 
2466     TSNode node = captures->contents[state->consumed_capture_count].node;
2467     if (
2468       ts_node_end_byte(node) <= self->start_byte ||
2469       point_lte(ts_node_end_point(node), self->start_point)
2470     ) {
2471       state->consumed_capture_count++;
2472       i--;
2473       continue;
2474     }
2475 
2476     uint32_t node_start_byte = ts_node_start_byte(node);
2477     if (
2478       !result ||
2479       node_start_byte < *byte_offset ||
2480       (node_start_byte == *byte_offset && state->pattern_index < *pattern_index)
2481     ) {
2482       QueryStep *step = &self->query->steps.contents[state->step_index];
2483       if (root_pattern_guaranteed) {
2484         *root_pattern_guaranteed = step->root_pattern_guaranteed;
2485       } else if (step->root_pattern_guaranteed) {
2486         continue;
2487       }
2488 
2489       result = true;
2490       *state_index = i;
2491       *byte_offset = node_start_byte;
2492       *pattern_index = state->pattern_index;
2493     }
2494   }
2495   return result;
2496 }
2497 
2498 // Determine which node is first in a depth-first traversal
ts_query_cursor__compare_nodes(TSNode left,TSNode right)2499 int ts_query_cursor__compare_nodes(TSNode left, TSNode right) {
2500   if (left.id != right.id) {
2501     uint32_t left_start = ts_node_start_byte(left);
2502     uint32_t right_start = ts_node_start_byte(right);
2503     if (left_start < right_start) return -1;
2504     if (left_start > right_start) return 1;
2505     uint32_t left_node_count = ts_node_end_byte(left);
2506     uint32_t right_node_count = ts_node_end_byte(right);
2507     if (left_node_count > right_node_count) return -1;
2508     if (left_node_count < right_node_count) return 1;
2509   }
2510   return 0;
2511 }
2512 
2513 // Determine if either state contains a superset of the other state's captures.
ts_query_cursor__compare_captures(TSQueryCursor * self,QueryState * left_state,QueryState * right_state,bool * left_contains_right,bool * right_contains_left)2514 void ts_query_cursor__compare_captures(
2515   TSQueryCursor *self,
2516   QueryState *left_state,
2517   QueryState *right_state,
2518   bool *left_contains_right,
2519   bool *right_contains_left
2520 ) {
2521   const CaptureList *left_captures = capture_list_pool_get(
2522     &self->capture_list_pool,
2523     left_state->capture_list_id
2524   );
2525   const CaptureList *right_captures = capture_list_pool_get(
2526     &self->capture_list_pool,
2527     right_state->capture_list_id
2528   );
2529   *left_contains_right = true;
2530   *right_contains_left = true;
2531   unsigned i = 0, j = 0;
2532   for (;;) {
2533     if (i < left_captures->size) {
2534       if (j < right_captures->size) {
2535         TSQueryCapture *left = &left_captures->contents[i];
2536         TSQueryCapture *right = &right_captures->contents[j];
2537         if (left->node.id == right->node.id && left->index == right->index) {
2538           i++;
2539           j++;
2540         } else {
2541           switch (ts_query_cursor__compare_nodes(left->node, right->node)) {
2542             case -1:
2543               *right_contains_left = false;
2544               i++;
2545               break;
2546             case 1:
2547               *left_contains_right = false;
2548               j++;
2549               break;
2550             default:
2551               *right_contains_left = false;
2552               *left_contains_right = false;
2553               i++;
2554               j++;
2555               break;
2556           }
2557         }
2558       } else {
2559         *right_contains_left = false;
2560         break;
2561       }
2562     } else {
2563       if (j < right_captures->size) {
2564         *left_contains_right = false;
2565       }
2566       break;
2567     }
2568   }
2569 }
2570 
2571 #ifdef DEBUG_EXECUTE_QUERY
2572 #define LOG(...) fprintf(stderr, __VA_ARGS__)
2573 #else
2574 #define LOG(...)
2575 #endif
2576 
ts_query_cursor__add_state(TSQueryCursor * self,const PatternEntry * pattern)2577 static void ts_query_cursor__add_state(
2578   TSQueryCursor *self,
2579   const PatternEntry *pattern
2580 ) {
2581   QueryStep *step = &self->query->steps.contents[pattern->step_index];
2582   uint32_t start_depth = self->depth - step->depth;
2583 
2584   // Keep the states array in ascending order of start_depth and pattern_index,
2585   // so that it can be processed more efficiently elsewhere. Usually, there is
2586   // no work to do here because of two facts:
2587   // * States with lower start_depth are naturally added first due to the
2588   //   order in which nodes are visited.
2589   // * Earlier patterns are naturally added first because of the ordering of the
2590   //   pattern_map data structure that's used to initiate matches.
2591   //
2592   // This loop is only needed in cases where two conditions hold:
2593   // * A pattern consists of more than one sibling node, so that its states
2594   //   remain in progress after exiting the node that started the match.
2595   // * The first node in the pattern matches against multiple nodes at the
2596   //   same depth.
2597   //
2598   // An example of this is the pattern '((comment)* (function))'. If multiple
2599   // `comment` nodes appear in a row, then we may initiate a new state for this
2600   // pattern while another state for the same pattern is already in progress.
2601   // If there are multiple patterns like this in a query, then this loop will
2602   // need to execute in order to keep the states ordered by pattern_index.
2603   uint32_t index = self->states.size;
2604   while (index > 0) {
2605     QueryState *prev_state = &self->states.contents[index - 1];
2606     if (prev_state->start_depth < start_depth) break;
2607     if (prev_state->start_depth == start_depth) {
2608       // Avoid inserting an unnecessary duplicate state, which would be
2609       // immediately pruned by the longest-match criteria.
2610       if (
2611         prev_state->pattern_index == pattern->pattern_index &&
2612         prev_state->step_index == pattern->step_index
2613       ) return;
2614       if (prev_state->pattern_index <= pattern->pattern_index) break;
2615     }
2616     index--;
2617   }
2618 
2619   LOG(
2620     "  start state. pattern:%u, step:%u\n",
2621     pattern->pattern_index,
2622     pattern->step_index
2623   );
2624   array_insert(&self->states, index, ((QueryState) {
2625     .id = UINT32_MAX,
2626     .capture_list_id = NONE,
2627     .step_index = pattern->step_index,
2628     .pattern_index = pattern->pattern_index,
2629     .start_depth = start_depth,
2630     .consumed_capture_count = 0,
2631     .seeking_immediate_match = true,
2632     .has_in_progress_alternatives = false,
2633     .needs_parent = step->depth == 1,
2634     .dead = false,
2635   }));
2636 }
2637 
2638 // Acquire a capture list for this state. If there are no capture lists left in the
2639 // pool, this will steal the capture list from another existing state, and mark that
2640 // other state as 'dead'.
ts_query_cursor__prepare_to_capture(TSQueryCursor * self,QueryState * state,unsigned state_index_to_preserve)2641 static CaptureList *ts_query_cursor__prepare_to_capture(
2642   TSQueryCursor *self,
2643   QueryState *state,
2644   unsigned state_index_to_preserve
2645 ) {
2646   if (state->capture_list_id == NONE) {
2647     state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool);
2648 
2649     // If there are no capture lists left in the pool, then terminate whichever
2650     // state has captured the earliest node in the document, and steal its
2651     // capture list.
2652     if (state->capture_list_id == NONE) {
2653       self->did_exceed_match_limit = true;
2654       uint32_t state_index, byte_offset, pattern_index;
2655       if (
2656         ts_query_cursor__first_in_progress_capture(
2657           self,
2658           &state_index,
2659           &byte_offset,
2660           &pattern_index,
2661           NULL
2662         ) &&
2663         state_index != state_index_to_preserve
2664       ) {
2665         LOG(
2666           "  abandon state. index:%u, pattern:%u, offset:%u.\n",
2667           state_index, pattern_index, byte_offset
2668         );
2669         QueryState *other_state = &self->states.contents[state_index];
2670         state->capture_list_id = other_state->capture_list_id;
2671         other_state->capture_list_id = NONE;
2672         other_state->dead = true;
2673         CaptureList *list = capture_list_pool_get_mut(
2674           &self->capture_list_pool,
2675           state->capture_list_id
2676         );
2677         array_clear(list);
2678         return list;
2679       } else {
2680         LOG("  ran out of capture lists");
2681         return NULL;
2682       }
2683     }
2684   }
2685   return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id);
2686 }
2687 
ts_query_cursor__capture(TSQueryCursor * self,QueryState * state,QueryStep * step,TSNode node)2688 static void ts_query_cursor__capture(
2689   TSQueryCursor *self,
2690   QueryState *state,
2691   QueryStep *step,
2692   TSNode node
2693 ) {
2694   if (state->dead) return;
2695   CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX);
2696   if (!capture_list) {
2697     state->dead = true;
2698     return;
2699   }
2700 
2701   for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) {
2702     uint16_t capture_id = step->capture_ids[j];
2703     if (step->capture_ids[j] == NONE) break;
2704     array_push(capture_list, ((TSQueryCapture) { node, capture_id }));
2705     LOG(
2706       "  capture node. type:%s, pattern:%u, capture_id:%u, capture_count:%u\n",
2707       ts_node_type(node),
2708       state->pattern_index,
2709       capture_id,
2710       capture_list->size
2711     );
2712   }
2713 }
2714 
2715 // Duplicate the given state and insert the newly-created state immediately after
2716 // the given state in the `states` array. Ensures that the given state reference is
2717 // still valid, even if the states array is reallocated.
ts_query_cursor__copy_state(TSQueryCursor * self,QueryState ** state_ref)2718 static QueryState *ts_query_cursor__copy_state(
2719   TSQueryCursor *self,
2720   QueryState **state_ref
2721 ) {
2722   const QueryState *state = *state_ref;
2723   uint32_t state_index = state - self->states.contents;
2724   QueryState copy = *state;
2725   copy.capture_list_id = NONE;
2726 
2727   // If the state has captures, copy its capture list.
2728   if (state->capture_list_id != NONE) {
2729     CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, &copy, state_index);
2730     if (!new_captures) return NULL;
2731     const CaptureList *old_captures = capture_list_pool_get(
2732       &self->capture_list_pool,
2733       state->capture_list_id
2734     );
2735     array_push_all(new_captures, old_captures);
2736   }
2737 
2738   array_insert(&self->states, state_index + 1, copy);
2739   *state_ref = &self->states.contents[state_index];
2740   return &self->states.contents[state_index + 1];
2741 }
2742 
2743 // Walk the tree, processing patterns until at least one pattern finishes,
2744 // If one or more patterns finish, return `true` and store their states in the
2745 // `finished_states` array. Multiple patterns can finish on the same node. If
2746 // there are no more matches, return `false`.
ts_query_cursor__advance(TSQueryCursor * self,bool stop_on_definite_step)2747 static inline bool ts_query_cursor__advance(
2748   TSQueryCursor *self,
2749   bool stop_on_definite_step
2750 ) {
2751   bool did_match = false;
2752   for (;;) {
2753     if (self->halted) {
2754       while (self->states.size > 0) {
2755         QueryState state = array_pop(&self->states);
2756         capture_list_pool_release(
2757           &self->capture_list_pool,
2758           state.capture_list_id
2759         );
2760       }
2761     }
2762 
2763     if (did_match || self->halted) return did_match;
2764 
2765     // Exit the current node.
2766     if (self->ascending) {
2767       LOG(
2768         "leave node. depth:%u, type:%s\n",
2769         self->depth,
2770         ts_node_type(ts_tree_cursor_current_node(&self->cursor))
2771       );
2772 
2773       // Leave this node by stepping to its next sibling or to its parent.
2774       if (ts_tree_cursor_goto_next_sibling(&self->cursor)) {
2775         self->ascending = false;
2776       } else if (ts_tree_cursor_goto_parent(&self->cursor)) {
2777         self->depth--;
2778       } else {
2779         LOG("halt at root\n");
2780         self->halted = true;
2781       }
2782 
2783       // After leaving a node, remove any states that cannot make further progress.
2784       uint32_t deleted_count = 0;
2785       for (unsigned i = 0, n = self->states.size; i < n; i++) {
2786         QueryState *state = &self->states.contents[i];
2787         QueryStep *step = &self->query->steps.contents[state->step_index];
2788 
2789         // If a state completed its pattern inside of this node, but was deferred from finishing
2790         // in order to search for longer matches, mark it as finished.
2791         if (step->depth == PATTERN_DONE_MARKER) {
2792           if (state->start_depth > self->depth || self->halted) {
2793             LOG("  finish pattern %u\n", state->pattern_index);
2794             array_push(&self->finished_states, *state);
2795             did_match = true;
2796             deleted_count++;
2797             continue;
2798           }
2799         }
2800 
2801         // If a state needed to match something within this node, then remove that state
2802         // as it has failed to match.
2803         else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) {
2804           LOG(
2805             "  failed to match. pattern:%u, step:%u\n",
2806             state->pattern_index,
2807             state->step_index
2808           );
2809           capture_list_pool_release(
2810             &self->capture_list_pool,
2811             state->capture_list_id
2812           );
2813           deleted_count++;
2814           continue;
2815         }
2816 
2817         if (deleted_count > 0) {
2818           self->states.contents[i - deleted_count] = *state;
2819         }
2820       }
2821       self->states.size -= deleted_count;
2822     }
2823 
2824     // Enter a new node.
2825     else {
2826       // Get the properties of the current node.
2827       TSNode node = ts_tree_cursor_current_node(&self->cursor);
2828       TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor);
2829       TSSymbol symbol = ts_node_symbol(node);
2830       bool is_named = ts_node_is_named(node);
2831       bool has_later_siblings;
2832       bool has_later_named_siblings;
2833       bool can_have_later_siblings_with_this_field;
2834       TSFieldId field_id = 0;
2835       TSSymbol supertypes[8] = {0};
2836       unsigned supertype_count = 8;
2837       ts_tree_cursor_current_status(
2838         &self->cursor,
2839         &field_id,
2840         &has_later_siblings,
2841         &has_later_named_siblings,
2842         &can_have_later_siblings_with_this_field,
2843         supertypes,
2844         &supertype_count
2845       );
2846       LOG(
2847         "enter node. depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n",
2848         self->depth,
2849         ts_node_type(node),
2850         ts_language_field_name_for_id(self->query->language, field_id),
2851         ts_node_start_point(node).row,
2852         self->states.size,
2853         self->finished_states.size
2854       );
2855 
2856       bool node_intersects_range = (
2857         ts_node_end_byte(node) > self->start_byte &&
2858         ts_node_start_byte(node) < self->end_byte &&
2859         point_gt(ts_node_end_point(node), self->start_point) &&
2860         point_lt(ts_node_start_point(node), self->end_point)
2861       );
2862 
2863       bool parent_intersects_range = ts_node_is_null(parent_node) || (
2864         ts_node_end_byte(parent_node) > self->start_byte &&
2865         ts_node_start_byte(parent_node) < self->end_byte &&
2866         point_gt(ts_node_end_point(parent_node), self->start_point) &&
2867         point_lt(ts_node_start_point(parent_node), self->end_point)
2868       );
2869 
2870         // Add new states for any patterns whose root node is a wildcard.
2871       for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) {
2872         PatternEntry *pattern = &self->query->pattern_map.contents[i];
2873 
2874         // If this node matches the first step of the pattern, then add a new
2875         // state at the start of this pattern.
2876         QueryStep *step = &self->query->steps.contents[pattern->step_index];
2877         if (
2878           (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) &&
2879           (!step->field || field_id == step->field) &&
2880           (!step->supertype_symbol || supertype_count > 0)
2881         ) {
2882           ts_query_cursor__add_state(self, pattern);
2883         }
2884       }
2885 
2886       // Add new states for any patterns whose root node matches this node.
2887       unsigned i;
2888       if (ts_query__pattern_map_search(self->query, symbol, &i)) {
2889         PatternEntry *pattern = &self->query->pattern_map.contents[i];
2890 
2891         QueryStep *step = &self->query->steps.contents[pattern->step_index];
2892         do {
2893           // If this node matches the first step of the pattern, then add a new
2894           // state at the start of this pattern.
2895           if (
2896             (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) &&
2897             (!step->field || field_id == step->field)
2898           ) {
2899             ts_query_cursor__add_state(self, pattern);
2900           }
2901 
2902           // Advance to the next pattern whose root node matches this node.
2903           i++;
2904           if (i == self->query->pattern_map.size) break;
2905           pattern = &self->query->pattern_map.contents[i];
2906           step = &self->query->steps.contents[pattern->step_index];
2907         } while (step->symbol == symbol);
2908       }
2909 
2910       // Update all of the in-progress states with current node.
2911       for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) {
2912         QueryState *state = &self->states.contents[i];
2913         QueryStep *step = &self->query->steps.contents[state->step_index];
2914         state->has_in_progress_alternatives = false;
2915         copy_count = 0;
2916 
2917         // Check that the node matches all of the criteria for the next
2918         // step of the pattern.
2919         if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue;
2920 
2921         // Determine if this node matches this step of the pattern, and also
2922         // if this node can have later siblings that match this step of the
2923         // pattern.
2924         bool node_does_match = false;
2925         if (step->symbol == WILDCARD_SYMBOL) {
2926           node_does_match = is_named || !step->is_named;
2927         } else {
2928           node_does_match = symbol == step->symbol;
2929         }
2930         bool later_sibling_can_match = has_later_siblings;
2931         if ((step->is_immediate && is_named) || state->seeking_immediate_match) {
2932           later_sibling_can_match = false;
2933         }
2934         if (step->is_last_child && has_later_named_siblings) {
2935           node_does_match = false;
2936         }
2937         if (step->supertype_symbol) {
2938           bool has_supertype = false;
2939           for (unsigned j = 0; j < supertype_count; j++) {
2940             if (supertypes[j] == step->supertype_symbol) {
2941               has_supertype = true;
2942               break;
2943             }
2944           }
2945           if (!has_supertype) node_does_match = false;
2946         }
2947         if (step->field) {
2948           if (step->field == field_id) {
2949             if (!can_have_later_siblings_with_this_field) {
2950               later_sibling_can_match = false;
2951             }
2952           } else {
2953             node_does_match = false;
2954           }
2955         }
2956 
2957         if (step->negated_field_list_id) {
2958           TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id];
2959           for (;;) {
2960             TSFieldId negated_field_id = *negated_field_ids;
2961             if (negated_field_id) {
2962               negated_field_ids++;
2963               if (ts_node_child_by_field_id(node, negated_field_id).id) {
2964                 node_does_match = false;
2965                 break;
2966               }
2967             } else {
2968               break;
2969             }
2970           }
2971         }
2972 
2973         // Remove states immediately if it is ever clear that they cannot match.
2974         if (!node_does_match) {
2975           if (!later_sibling_can_match) {
2976             LOG(
2977               "  discard state. pattern:%u, step:%u\n",
2978               state->pattern_index,
2979               state->step_index
2980             );
2981             capture_list_pool_release(
2982               &self->capture_list_pool,
2983               state->capture_list_id
2984             );
2985             array_erase(&self->states, i);
2986             i--;
2987           }
2988           continue;
2989         }
2990 
2991         // Some patterns can match their root node in multiple ways, capturing different
2992         // children. If this pattern step could match later children within the same
2993         // parent, then this query state cannot simply be updated in place. It must be
2994         // split into two states: one that matches this node, and one which skips over
2995         // this node, to preserve the possibility of matching later siblings.
2996         if (later_sibling_can_match && (
2997           step->contains_captures ||
2998           ts_query__step_is_fallible(self->query, state->step_index)
2999         )) {
3000           if (ts_query_cursor__copy_state(self, &state)) {
3001             LOG(
3002               "  split state for capture. pattern:%u, step:%u\n",
3003               state->pattern_index,
3004               state->step_index
3005             );
3006             copy_count++;
3007           }
3008         }
3009 
3010         // If this pattern started with a wildcard, such that the pattern map
3011         // actually points to the *second* step of the pattern, then check
3012         // that the node has a parent, and capture the parent node if necessary.
3013         if (state->needs_parent) {
3014           TSNode parent = ts_tree_cursor_parent_node(&self->cursor);
3015           if (ts_node_is_null(parent)) {
3016             LOG("  missing parent node\n");
3017             state->dead = true;
3018           } else {
3019             state->needs_parent = false;
3020             QueryStep *skipped_wildcard_step = step;
3021             do {
3022               skipped_wildcard_step--;
3023             } while (
3024               skipped_wildcard_step->is_dead_end ||
3025               skipped_wildcard_step->is_pass_through ||
3026               skipped_wildcard_step->depth > 0
3027             );
3028             if (skipped_wildcard_step->capture_ids[0] != NONE) {
3029               LOG("  capture wildcard parent\n");
3030               ts_query_cursor__capture(
3031                 self,
3032                 state,
3033                 skipped_wildcard_step,
3034                 parent
3035               );
3036             }
3037           }
3038         }
3039 
3040         // If the current node is captured in this pattern, add it to the capture list.
3041         if (step->capture_ids[0] != NONE) {
3042           ts_query_cursor__capture(self, state, step, node);
3043         }
3044 
3045         if (state->dead) {
3046           array_erase(&self->states, i);
3047           i--;
3048           continue;
3049         }
3050 
3051         // Advance this state to the next step of its pattern.
3052         state->step_index++;
3053         state->seeking_immediate_match = false;
3054         LOG(
3055           "  advance state. pattern:%u, step:%u\n",
3056           state->pattern_index,
3057           state->step_index
3058         );
3059 
3060         QueryStep *next_step = &self->query->steps.contents[state->step_index];
3061         if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true;
3062 
3063         // If this state's next step has an alternative step, then copy the state in order
3064         // to pursue both alternatives. The alternative step itself may have an alternative,
3065         // so this is an interative process.
3066         unsigned end_index = i + 1;
3067         for (unsigned j = i; j < end_index; j++) {
3068           QueryState *state = &self->states.contents[j];
3069           QueryStep *next_step = &self->query->steps.contents[state->step_index];
3070           if (next_step->alternative_index != NONE) {
3071             // A "dead-end" step exists only to add a non-sequential jump into the step sequence,
3072             // via its alternative index. When a state reaches a dead-end step, it jumps straight
3073             // to the step's alternative.
3074             if (next_step->is_dead_end) {
3075               state->step_index = next_step->alternative_index;
3076               j--;
3077               continue;
3078             }
3079 
3080             // A "pass-through" step exists only to add a branch into the step sequence,
3081             // via its alternative_index. When a state reaches a pass-through step, it splits
3082             // in order to process the alternative step, and then it advances to the next step.
3083             if (next_step->is_pass_through) {
3084               state->step_index++;
3085               j--;
3086             }
3087 
3088             QueryState *copy = ts_query_cursor__copy_state(self, &state);
3089             if (copy) {
3090               LOG(
3091                 "  split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n",
3092                 copy->pattern_index,
3093                 copy->step_index,
3094                 next_step->alternative_index,
3095                 next_step->alternative_is_immediate,
3096                 capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size
3097               );
3098               end_index++;
3099               copy_count++;
3100               copy->step_index = next_step->alternative_index;
3101               if (next_step->alternative_is_immediate) {
3102                 copy->seeking_immediate_match = true;
3103               }
3104             }
3105           }
3106         }
3107       }
3108 
3109       for (unsigned i = 0; i < self->states.size; i++) {
3110         QueryState *state = &self->states.contents[i];
3111         if (state->dead) {
3112           array_erase(&self->states, i);
3113           i--;
3114           continue;
3115         }
3116 
3117         // Enfore the longest-match criteria. When a query pattern contains optional or
3118         // repeated nodes, this is necessary to avoid multiple redundant states, where
3119         // one state has a strict subset of another state's captures.
3120         bool did_remove = false;
3121         for (unsigned j = i + 1; j < self->states.size; j++) {
3122           QueryState *other_state = &self->states.contents[j];
3123 
3124           // Query states are kept in ascending order of start_depth and pattern_index.
3125           // Since the longest-match criteria is only used for deduping matches of the same
3126           // pattern and root node, we only need to perform pairwise comparisons within a
3127           // small slice of the states array.
3128           if (
3129             other_state->start_depth != state->start_depth ||
3130             other_state->pattern_index != state->pattern_index
3131           ) break;
3132 
3133           bool left_contains_right, right_contains_left;
3134           ts_query_cursor__compare_captures(
3135             self,
3136             state,
3137             other_state,
3138             &left_contains_right,
3139             &right_contains_left
3140           );
3141           if (left_contains_right) {
3142             if (state->step_index == other_state->step_index) {
3143               LOG(
3144                 "  drop shorter state. pattern: %u, step_index: %u\n",
3145                 state->pattern_index,
3146                 state->step_index
3147               );
3148               capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id);
3149               array_erase(&self->states, j);
3150               j--;
3151               continue;
3152             }
3153             other_state->has_in_progress_alternatives = true;
3154           }
3155           if (right_contains_left) {
3156             if (state->step_index == other_state->step_index) {
3157               LOG(
3158                 "  drop shorter state. pattern: %u, step_index: %u\n",
3159                 state->pattern_index,
3160                 state->step_index
3161               );
3162               capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
3163               array_erase(&self->states, i);
3164               i--;
3165               did_remove = true;
3166               break;
3167             }
3168             state->has_in_progress_alternatives = true;
3169           }
3170         }
3171 
3172         // If the state is at the end of its pattern, remove it from the list
3173         // of in-progress states and add it to the list of finished states.
3174         if (!did_remove) {
3175           LOG(
3176             "  keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n",
3177             state->pattern_index,
3178             state->start_depth,
3179             state->step_index,
3180             capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size
3181           );
3182           QueryStep *next_step = &self->query->steps.contents[state->step_index];
3183           if (next_step->depth == PATTERN_DONE_MARKER) {
3184             if (state->has_in_progress_alternatives) {
3185               LOG("  defer finishing pattern %u\n", state->pattern_index);
3186             } else {
3187               LOG("  finish pattern %u\n", state->pattern_index);
3188               array_push(&self->finished_states, *state);
3189               array_erase(&self->states, state - self->states.contents);
3190               did_match = true;
3191               i--;
3192             }
3193           }
3194         }
3195       }
3196 
3197       // When the current node ends prior to the desired start offset,
3198       // only descend for the purpose of continuing in-progress matches.
3199       bool should_descend = node_intersects_range;
3200       if (!should_descend) {
3201         for (unsigned i = 0; i < self->states.size; i++) {
3202           QueryState *state = &self->states.contents[i];;
3203           QueryStep *next_step = &self->query->steps.contents[state->step_index];
3204           if (
3205             next_step->depth != PATTERN_DONE_MARKER &&
3206             state->start_depth + next_step->depth > self->depth
3207           ) {
3208             should_descend = true;
3209             break;
3210           }
3211         }
3212       }
3213 
3214       if (!should_descend) {
3215         LOG(
3216           "  not descending. node end byte: %u, start byte: %u\n",
3217           ts_node_end_byte(node),
3218           self->start_byte
3219         );
3220       }
3221 
3222       if (should_descend && ts_tree_cursor_goto_first_child(&self->cursor)) {
3223         self->depth++;
3224       } else {
3225         self->ascending = true;
3226       }
3227     }
3228   }
3229 }
3230 
ts_query_cursor_next_match(TSQueryCursor * self,TSQueryMatch * match)3231 bool ts_query_cursor_next_match(
3232   TSQueryCursor *self,
3233   TSQueryMatch *match
3234 ) {
3235   if (self->finished_states.size == 0) {
3236     if (!ts_query_cursor__advance(self, false)) {
3237       return false;
3238     }
3239   }
3240 
3241   QueryState *state = &self->finished_states.contents[0];
3242   if (state->id == UINT32_MAX) state->id = self->next_state_id++;
3243   match->id = state->id;
3244   match->pattern_index = state->pattern_index;
3245   const CaptureList *captures = capture_list_pool_get(
3246     &self->capture_list_pool,
3247     state->capture_list_id
3248   );
3249   match->captures = captures->contents;
3250   match->capture_count = captures->size;
3251   capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
3252   array_erase(&self->finished_states, 0);
3253   return true;
3254 }
3255 
ts_query_cursor_remove_match(TSQueryCursor * self,uint32_t match_id)3256 void ts_query_cursor_remove_match(
3257   TSQueryCursor *self,
3258   uint32_t match_id
3259 ) {
3260   for (unsigned i = 0; i < self->finished_states.size; i++) {
3261     const QueryState *state = &self->finished_states.contents[i];
3262     if (state->id == match_id) {
3263       capture_list_pool_release(
3264         &self->capture_list_pool,
3265         state->capture_list_id
3266       );
3267       array_erase(&self->finished_states, i);
3268       return;
3269     }
3270   }
3271 }
3272 
ts_query_cursor_next_capture(TSQueryCursor * self,TSQueryMatch * match,uint32_t * capture_index)3273 bool ts_query_cursor_next_capture(
3274   TSQueryCursor *self,
3275   TSQueryMatch *match,
3276   uint32_t *capture_index
3277 ) {
3278   // The goal here is to return captures in order, even though they may not
3279   // be discovered in order, because patterns can overlap. Search for matches
3280   // until there is a finished capture that is before any unfinished capture.
3281   for (;;) {
3282     // First, find the earliest capture in an unfinished match.
3283     uint32_t first_unfinished_capture_byte;
3284     uint32_t first_unfinished_pattern_index;
3285     uint32_t first_unfinished_state_index;
3286     bool first_unfinished_state_is_definite = false;
3287     ts_query_cursor__first_in_progress_capture(
3288       self,
3289       &first_unfinished_state_index,
3290       &first_unfinished_capture_byte,
3291       &first_unfinished_pattern_index,
3292       &first_unfinished_state_is_definite
3293     );
3294 
3295     // Then find the earliest capture in a finished match. It must occur
3296     // before the first capture in an *unfinished* match.
3297     QueryState *first_finished_state = NULL;
3298     uint32_t first_finished_capture_byte = first_unfinished_capture_byte;
3299     uint32_t first_finished_pattern_index = first_unfinished_pattern_index;
3300     for (unsigned i = 0; i < self->finished_states.size;) {
3301       QueryState *state = &self->finished_states.contents[i];
3302       const CaptureList *captures = capture_list_pool_get(
3303         &self->capture_list_pool,
3304         state->capture_list_id
3305       );
3306 
3307       // Remove states whose captures are all consumed.
3308       if (state->consumed_capture_count >= captures->size) {
3309         capture_list_pool_release(
3310           &self->capture_list_pool,
3311           state->capture_list_id
3312         );
3313         array_erase(&self->finished_states, i);
3314         continue;
3315       }
3316 
3317       // Skip captures that precede the cursor's start byte.
3318       TSNode node = captures->contents[state->consumed_capture_count].node;
3319       if (ts_node_end_byte(node) <= self->start_byte) {
3320         state->consumed_capture_count++;
3321         continue;
3322       }
3323 
3324       uint32_t node_start_byte = ts_node_start_byte(node);
3325       if (
3326         node_start_byte < first_finished_capture_byte ||
3327         (
3328           node_start_byte == first_finished_capture_byte &&
3329           state->pattern_index < first_finished_pattern_index
3330         )
3331       ) {
3332         first_finished_state = state;
3333         first_finished_capture_byte = node_start_byte;
3334         first_finished_pattern_index = state->pattern_index;
3335       }
3336       i++;
3337     }
3338 
3339     // If there is finished capture that is clearly before any unfinished
3340     // capture, then return its match, and its capture index. Internally
3341     // record the fact that the capture has been 'consumed'.
3342     QueryState *state;
3343     if (first_finished_state) {
3344       state = first_finished_state;
3345     } else if (first_unfinished_state_is_definite) {
3346       state = &self->states.contents[first_unfinished_state_index];
3347     } else {
3348       state = NULL;
3349     }
3350 
3351     if (state) {
3352       if (state->id == UINT32_MAX) state->id = self->next_state_id++;
3353       match->id = state->id;
3354       match->pattern_index = state->pattern_index;
3355       const CaptureList *captures = capture_list_pool_get(
3356         &self->capture_list_pool,
3357         state->capture_list_id
3358       );
3359       match->captures = captures->contents;
3360       match->capture_count = captures->size;
3361       *capture_index = state->consumed_capture_count;
3362       state->consumed_capture_count++;
3363       return true;
3364     }
3365 
3366     if (capture_list_pool_is_empty(&self->capture_list_pool)) {
3367       LOG(
3368         "  abandon state. index:%u, pattern:%u, offset:%u.\n",
3369         first_unfinished_state_index,
3370         first_unfinished_pattern_index,
3371         first_unfinished_capture_byte
3372       );
3373       capture_list_pool_release(
3374         &self->capture_list_pool,
3375         self->states.contents[first_unfinished_state_index].capture_list_id
3376       );
3377       array_erase(&self->states, first_unfinished_state_index);
3378     }
3379 
3380     // If there are no finished matches that are ready to be returned, then
3381     // continue finding more matches.
3382     if (
3383       !ts_query_cursor__advance(self, true) &&
3384       self->finished_states.size == 0
3385     ) return false;
3386   }
3387 }
3388 
3389 #undef LOG
3390