1 #include "tree_sitter/api.h"
2 #include "./alloc.h"
3 #include "./array.h"
4 #include "./language.h"
5 #include "./point.h"
6 #include "./tree_cursor.h"
7 #include "./unicode.h"
8 #include <wctype.h>
9
10 // #define DEBUG_ANALYZE_QUERY
11 // #define DEBUG_EXECUTE_QUERY
12
13 #define MAX_STEP_CAPTURE_COUNT 3
14 #define MAX_STATE_PREDECESSOR_COUNT 100
15 #define MAX_ANALYSIS_STATE_DEPTH 8
16 #define MAX_NEGATED_FIELD_COUNT 8
17
18 /*
19 * Stream - A sequence of unicode characters derived from a UTF8 string.
20 * This struct is used in parsing queries from S-expressions.
21 */
22 typedef struct {
23 const char *input;
24 const char *start;
25 const char *end;
26 int32_t next;
27 uint8_t next_size;
28 } Stream;
29
30 /*
31 * QueryStep - A step in the process of matching a query. Each node within
32 * a query S-expression corresponds to one of these steps. An entire pattern
33 * is represented as a sequence of these steps. The basic properties of a
34 * node are represented by these fields:
35 * - `symbol` - The grammar symbol to match. A zero value represents the
36 * wildcard symbol, '_'.
37 * - `field` - The field name to match. A zero value means that a field name
38 * was not specified.
39 * - `capture_ids` - An array of integers representing the names of captures
40 * associated with this node in the pattern, terminated by a `NONE` value.
41 * - `depth` - The depth where this node occurs in the pattern. The root node
42 * of the pattern has depth zero.
43 * - `negated_field_list_id` - An id representing a set of fields that must
44 * that must not be present on a node matching this step.
45 *
46 * Steps have some additional fields in order to handle the `.` (or "anchor") operator,
47 * which forbids additional child nodes:
48 * - `is_immediate` - Indicates that the node matching this step cannot be preceded
49 * by other sibling nodes that weren't specified in the pattern.
50 * - `is_last_child` - Indicates that the node matching this step cannot have any
51 * subsequent named siblings.
52 *
53 * For simple patterns, steps are matched in sequential order. But in order to
54 * handle alternative/repeated/optional sub-patterns, query steps are not always
55 * structured as a linear sequence; they sometimes need to split and merge. This
56 * is done using the following fields:
57 * - `alternative_index` - The index of a different query step that serves as
58 * an alternative to this step. A `NONE` value represents no alternative.
59 * When a query state reaches a step with an alternative index, the state
60 * is duplicated, with one copy remaining at the original step, and one copy
61 * moving to the alternative step. The alternative may have its own alternative
62 * step, so this splitting is an iterative process.
63 * - `is_dead_end` - Indicates that this state cannot be passed directly, and
64 * exists only in order to redirect to an alternative index, with no splitting.
65 * - `is_pass_through` - Indicates that state has no matching logic of its own,
66 * and exists only to split a state. One copy of the state advances immediately
67 * to the next step, and one moves to the alternative step.
68 * - `alternative_is_immediate` - Indicates that this step's alternative step
69 * should be treated as if `is_immediate` is true.
70 *
71 * Steps also store some derived state that summarizes how they relate to other
72 * steps within the same pattern. This is used to optimize the matching process:
73 * - `contains_captures` - Indicates that this step or one of its child steps
74 * has a non-empty `capture_ids` list.
75 * - `parent_pattern_guaranteed` - Indicates that if this step is reached, then
76 * it and all of its subsequent sibling steps within the same parent pattern
77 * are guaranteed to match.
78 * - `root_pattern_guaranteed` - Similar to `parent_pattern_guaranteed`, but
79 * for the entire top-level pattern. When iterating through a query's
80 * captures using `ts_query_cursor_next_capture`, this field is used to
81 * detect that a capture can safely be returned from a match that has not
82 * even completed yet.
83 */
84 typedef struct {
85 TSSymbol symbol;
86 TSSymbol supertype_symbol;
87 TSFieldId field;
88 uint16_t capture_ids[MAX_STEP_CAPTURE_COUNT];
89 uint16_t depth;
90 uint16_t alternative_index;
91 uint16_t negated_field_list_id;
92 bool is_named: 1;
93 bool is_immediate: 1;
94 bool is_last_child: 1;
95 bool is_pass_through: 1;
96 bool is_dead_end: 1;
97 bool alternative_is_immediate: 1;
98 bool contains_captures: 1;
99 bool root_pattern_guaranteed: 1;
100 bool parent_pattern_guaranteed: 1;
101 } QueryStep;
102
103 /*
104 * Slice - A slice of an external array. Within a query, capture names,
105 * literal string values, and predicate step informations are stored in three
106 * contiguous arrays. Individual captures, string values, and predicates are
107 * represented as slices of these three arrays.
108 */
109 typedef struct {
110 uint32_t offset;
111 uint32_t length;
112 } Slice;
113
114 /*
115 * SymbolTable - a two-way mapping of strings to ids.
116 */
117 typedef struct {
118 Array(char) characters;
119 Array(Slice) slices;
120 } SymbolTable;
121
122 /*
123 * PatternEntry - Information about the starting point for matching a particular
124 * pattern. These entries are stored in a 'pattern map' - a sorted array that
125 * makes it possible to efficiently lookup patterns based on the symbol for their
126 * first step. The entry consists of the following fields:
127 * - `pattern_index` - the index of the pattern within the query
128 * - `step_index` - the index of the pattern's first step in the shared `steps` array
129 * - `is_rooted` - whether or not the pattern has a single root node. This property
130 * affects decisions about whether or not to start the pattern for nodes outside
131 * of a QueryCursor's range restriction.
132 */
133 typedef struct {
134 uint16_t step_index;
135 uint16_t pattern_index;
136 bool is_rooted;
137 } PatternEntry;
138
139 typedef struct {
140 Slice steps;
141 Slice predicate_steps;
142 uint32_t start_byte;
143 } QueryPattern;
144
145 typedef struct {
146 uint32_t byte_offset;
147 uint16_t step_index;
148 } StepOffset;
149
150 /*
151 * QueryState - The state of an in-progress match of a particular pattern
152 * in a query. While executing, a `TSQueryCursor` must keep track of a number
153 * of possible in-progress matches. Each of those possible matches is
154 * represented as one of these states. Fields:
155 * - `id` - A numeric id that is exposed to the public API. This allows the
156 * caller to remove a given match, preventing any more of its captures
157 * from being returned.
158 * - `start_depth` - The depth in the tree where the first step of the state's
159 * pattern was matched.
160 * - `pattern_index` - The pattern that the state is matching.
161 * - `consumed_capture_count` - The number of captures from this match that
162 * have already been returned.
163 * - `capture_list_id` - A numeric id that can be used to retrieve the state's
164 * list of captures from the `CaptureListPool`.
165 * - `seeking_immediate_match` - A flag that indicates that the state's next
166 * step must be matched by the very next sibling. This is used when
167 * processing repetitions.
168 * - `has_in_progress_alternatives` - A flag that indicates that there is are
169 * other states that have the same captures as this state, but are at
170 * different steps in their pattern. This means that in order to obey the
171 * 'longest-match' rule, this state should not be returned as a match until
172 * it is clear that there can be no other alternative match with more captures.
173 */
174 typedef struct {
175 uint32_t id;
176 uint32_t capture_list_id;
177 uint16_t start_depth;
178 uint16_t step_index;
179 uint16_t pattern_index;
180 uint16_t consumed_capture_count: 12;
181 bool seeking_immediate_match: 1;
182 bool has_in_progress_alternatives: 1;
183 bool dead: 1;
184 bool needs_parent: 1;
185 } QueryState;
186
187 typedef Array(TSQueryCapture) CaptureList;
188
189 /*
190 * CaptureListPool - A collection of *lists* of captures. Each query state needs
191 * to maintain its own list of captures. To avoid repeated allocations, this struct
192 * maintains a fixed set of capture lists, and keeps track of which ones are
193 * currently in use by a query state.
194 */
195 typedef struct {
196 Array(CaptureList) list;
197 CaptureList empty_list;
198 // The maximum number of capture lists that we are allowed to allocate. We
199 // never allow `list` to allocate more entries than this, dropping pending
200 // matches if needed to stay under the limit.
201 uint32_t max_capture_list_count;
202 // The number of capture lists allocated in `list` that are not currently in
203 // use. We reuse those existing-but-unused capture lists before trying to
204 // allocate any new ones. We use an invalid value (UINT32_MAX) for a capture
205 // list's length to indicate that it's not in use.
206 uint32_t free_capture_list_count;
207 } CaptureListPool;
208
209 /*
210 * AnalysisState - The state needed for walking the parse table when analyzing
211 * a query pattern, to determine at which steps the pattern might fail to match.
212 */
213 typedef struct {
214 TSStateId parse_state;
215 TSSymbol parent_symbol;
216 uint16_t child_index;
217 TSFieldId field_id: 15;
218 bool done: 1;
219 } AnalysisStateEntry;
220
221 typedef struct {
222 AnalysisStateEntry stack[MAX_ANALYSIS_STATE_DEPTH];
223 uint16_t depth;
224 uint16_t step_index;
225 } AnalysisState;
226
227 typedef Array(AnalysisState) AnalysisStateSet;
228
229 /*
230 * AnalysisSubgraph - A subset of the states in the parse table that are used
231 * in constructing nodes with a certain symbol. Each state is accompanied by
232 * some information about the possible node that could be produced in
233 * downstream states.
234 */
235 typedef struct {
236 TSStateId state;
237 uint8_t production_id;
238 uint8_t child_index: 7;
239 bool done: 1;
240 } AnalysisSubgraphNode;
241
242 typedef struct {
243 TSSymbol symbol;
244 Array(TSStateId) start_states;
245 Array(AnalysisSubgraphNode) nodes;
246 } AnalysisSubgraph;
247
248 /*
249 * StatePredecessorMap - A map that stores the predecessors of each parse state.
250 * This is used during query analysis to determine which parse states can lead
251 * to which reduce actions.
252 */
253 typedef struct {
254 TSStateId *contents;
255 } StatePredecessorMap;
256
257 /*
258 * TSQuery - A tree query, compiled from a string of S-expressions. The query
259 * itself is immutable. The mutable state used in the process of executing the
260 * query is stored in a `TSQueryCursor`.
261 */
262 struct TSQuery {
263 SymbolTable captures;
264 SymbolTable predicate_values;
265 Array(QueryStep) steps;
266 Array(PatternEntry) pattern_map;
267 Array(TSQueryPredicateStep) predicate_steps;
268 Array(QueryPattern) patterns;
269 Array(StepOffset) step_offsets;
270 Array(TSFieldId) negated_fields;
271 Array(char) string_buffer;
272 const TSLanguage *language;
273 uint16_t wildcard_root_pattern_count;
274 };
275
276 /*
277 * TSQueryCursor - A stateful struct used to execute a query on a tree.
278 */
279 struct TSQueryCursor {
280 const TSQuery *query;
281 TSTreeCursor cursor;
282 Array(QueryState) states;
283 Array(QueryState) finished_states;
284 CaptureListPool capture_list_pool;
285 uint32_t depth;
286 uint32_t start_byte;
287 uint32_t end_byte;
288 TSPoint start_point;
289 TSPoint end_point;
290 uint32_t next_state_id;
291 bool ascending;
292 bool halted;
293 bool did_exceed_match_limit;
294 };
295
296 static const TSQueryError PARENT_DONE = -1;
297 static const uint16_t PATTERN_DONE_MARKER = UINT16_MAX;
298 static const uint16_t NONE = UINT16_MAX;
299 static const TSSymbol WILDCARD_SYMBOL = 0;
300
301 /**********
302 * Stream
303 **********/
304
305 // Advance to the next unicode code point in the stream.
stream_advance(Stream * self)306 static bool stream_advance(Stream *self) {
307 self->input += self->next_size;
308 if (self->input < self->end) {
309 uint32_t size = ts_decode_utf8(
310 (const uint8_t *)self->input,
311 self->end - self->input,
312 &self->next
313 );
314 if (size > 0) {
315 self->next_size = size;
316 return true;
317 }
318 } else {
319 self->next_size = 0;
320 self->next = '\0';
321 }
322 return false;
323 }
324
325 // Reset the stream to the given input position, represented as a pointer
326 // into the input string.
stream_reset(Stream * self,const char * input)327 static void stream_reset(Stream *self, const char *input) {
328 self->input = input;
329 self->next_size = 0;
330 stream_advance(self);
331 }
332
stream_new(const char * string,uint32_t length)333 static Stream stream_new(const char *string, uint32_t length) {
334 Stream self = {
335 .next = 0,
336 .input = string,
337 .start = string,
338 .end = string + length,
339 };
340 stream_advance(&self);
341 return self;
342 }
343
stream_skip_whitespace(Stream * self)344 static void stream_skip_whitespace(Stream *self) {
345 for (;;) {
346 if (iswspace(self->next)) {
347 stream_advance(self);
348 } else if (self->next == ';') {
349 // skip over comments
350 stream_advance(self);
351 while (self->next && self->next != '\n') {
352 if (!stream_advance(self)) break;
353 }
354 } else {
355 break;
356 }
357 }
358 }
359
stream_is_ident_start(Stream * self)360 static bool stream_is_ident_start(Stream *self) {
361 return iswalnum(self->next) || self->next == '_' || self->next == '-';
362 }
363
stream_scan_identifier(Stream * stream)364 static void stream_scan_identifier(Stream *stream) {
365 do {
366 stream_advance(stream);
367 } while (
368 iswalnum(stream->next) ||
369 stream->next == '_' ||
370 stream->next == '-' ||
371 stream->next == '.' ||
372 stream->next == '?' ||
373 stream->next == '!'
374 );
375 }
376
stream_offset(Stream * self)377 static uint32_t stream_offset(Stream *self) {
378 return self->input - self->start;
379 }
380
381 /******************
382 * CaptureListPool
383 ******************/
384
capture_list_pool_new(void)385 static CaptureListPool capture_list_pool_new(void) {
386 return (CaptureListPool) {
387 .list = array_new(),
388 .empty_list = array_new(),
389 .max_capture_list_count = UINT32_MAX,
390 .free_capture_list_count = 0,
391 };
392 }
393
capture_list_pool_reset(CaptureListPool * self)394 static void capture_list_pool_reset(CaptureListPool *self) {
395 for (uint16_t i = 0; i < self->list.size; i++) {
396 // This invalid size means that the list is not in use.
397 self->list.contents[i].size = UINT32_MAX;
398 }
399 self->free_capture_list_count = self->list.size;
400 }
401
capture_list_pool_delete(CaptureListPool * self)402 static void capture_list_pool_delete(CaptureListPool *self) {
403 for (uint16_t i = 0; i < self->list.size; i++) {
404 array_delete(&self->list.contents[i]);
405 }
406 array_delete(&self->list);
407 }
408
capture_list_pool_get(const CaptureListPool * self,uint16_t id)409 static const CaptureList *capture_list_pool_get(const CaptureListPool *self, uint16_t id) {
410 if (id >= self->list.size) return &self->empty_list;
411 return &self->list.contents[id];
412 }
413
capture_list_pool_get_mut(CaptureListPool * self,uint16_t id)414 static CaptureList *capture_list_pool_get_mut(CaptureListPool *self, uint16_t id) {
415 assert(id < self->list.size);
416 return &self->list.contents[id];
417 }
418
capture_list_pool_is_empty(const CaptureListPool * self)419 static bool capture_list_pool_is_empty(const CaptureListPool *self) {
420 // The capture list pool is empty if all allocated lists are in use, and we
421 // have reached the maximum allowed number of allocated lists.
422 return self->free_capture_list_count == 0 && self->list.size >= self->max_capture_list_count;
423 }
424
capture_list_pool_acquire(CaptureListPool * self)425 static uint16_t capture_list_pool_acquire(CaptureListPool *self) {
426 // First see if any already allocated capture list is currently unused.
427 if (self->free_capture_list_count > 0) {
428 for (uint16_t i = 0; i < self->list.size; i++) {
429 if (self->list.contents[i].size == UINT32_MAX) {
430 array_clear(&self->list.contents[i]);
431 self->free_capture_list_count--;
432 return i;
433 }
434 }
435 }
436
437 // Otherwise allocate and initialize a new capture list, as long as that
438 // doesn't put us over the requested maximum.
439 uint32_t i = self->list.size;
440 if (i >= self->max_capture_list_count) {
441 return NONE;
442 }
443 CaptureList list;
444 array_init(&list);
445 array_push(&self->list, list);
446 return i;
447 }
448
capture_list_pool_release(CaptureListPool * self,uint16_t id)449 static void capture_list_pool_release(CaptureListPool *self, uint16_t id) {
450 if (id >= self->list.size) return;
451 self->list.contents[id].size = UINT32_MAX;
452 self->free_capture_list_count++;
453 }
454
455 /**************
456 * SymbolTable
457 **************/
458
symbol_table_new(void)459 static SymbolTable symbol_table_new(void) {
460 return (SymbolTable) {
461 .characters = array_new(),
462 .slices = array_new(),
463 };
464 }
465
symbol_table_delete(SymbolTable * self)466 static void symbol_table_delete(SymbolTable *self) {
467 array_delete(&self->characters);
468 array_delete(&self->slices);
469 }
470
symbol_table_id_for_name(const SymbolTable * self,const char * name,uint32_t length)471 static int symbol_table_id_for_name(
472 const SymbolTable *self,
473 const char *name,
474 uint32_t length
475 ) {
476 for (unsigned i = 0; i < self->slices.size; i++) {
477 Slice slice = self->slices.contents[i];
478 if (
479 slice.length == length &&
480 !strncmp(&self->characters.contents[slice.offset], name, length)
481 ) return i;
482 }
483 return -1;
484 }
485
symbol_table_name_for_id(const SymbolTable * self,uint16_t id,uint32_t * length)486 static const char *symbol_table_name_for_id(
487 const SymbolTable *self,
488 uint16_t id,
489 uint32_t *length
490 ) {
491 Slice slice = self->slices.contents[id];
492 *length = slice.length;
493 return &self->characters.contents[slice.offset];
494 }
495
symbol_table_insert_name(SymbolTable * self,const char * name,uint32_t length)496 static uint16_t symbol_table_insert_name(
497 SymbolTable *self,
498 const char *name,
499 uint32_t length
500 ) {
501 int id = symbol_table_id_for_name(self, name, length);
502 if (id >= 0) return (uint16_t)id;
503 Slice slice = {
504 .offset = self->characters.size,
505 .length = length,
506 };
507 array_grow_by(&self->characters, length + 1);
508 memcpy(&self->characters.contents[slice.offset], name, length);
509 self->characters.contents[self->characters.size - 1] = 0;
510 array_push(&self->slices, slice);
511 return self->slices.size - 1;
512 }
513
514 /************
515 * QueryStep
516 ************/
517
query_step__new(TSSymbol symbol,uint16_t depth,bool is_immediate)518 static QueryStep query_step__new(
519 TSSymbol symbol,
520 uint16_t depth,
521 bool is_immediate
522 ) {
523 return (QueryStep) {
524 .symbol = symbol,
525 .depth = depth,
526 .field = 0,
527 .capture_ids = {NONE, NONE, NONE},
528 .alternative_index = NONE,
529 .negated_field_list_id = 0,
530 .contains_captures = false,
531 .is_last_child = false,
532 .is_named = false,
533 .is_pass_through = false,
534 .is_dead_end = false,
535 .root_pattern_guaranteed = false,
536 .is_immediate = is_immediate,
537 .alternative_is_immediate = false,
538 };
539 }
540
query_step__add_capture(QueryStep * self,uint16_t capture_id)541 static void query_step__add_capture(QueryStep *self, uint16_t capture_id) {
542 for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) {
543 if (self->capture_ids[i] == NONE) {
544 self->capture_ids[i] = capture_id;
545 break;
546 }
547 }
548 }
549
query_step__remove_capture(QueryStep * self,uint16_t capture_id)550 static void query_step__remove_capture(QueryStep *self, uint16_t capture_id) {
551 for (unsigned i = 0; i < MAX_STEP_CAPTURE_COUNT; i++) {
552 if (self->capture_ids[i] == capture_id) {
553 self->capture_ids[i] = NONE;
554 while (i + 1 < MAX_STEP_CAPTURE_COUNT) {
555 if (self->capture_ids[i + 1] == NONE) break;
556 self->capture_ids[i] = self->capture_ids[i + 1];
557 self->capture_ids[i + 1] = NONE;
558 i++;
559 }
560 break;
561 }
562 }
563 }
564
565 /**********************
566 * StatePredecessorMap
567 **********************/
568
state_predecessor_map_new(const TSLanguage * language)569 static inline StatePredecessorMap state_predecessor_map_new(
570 const TSLanguage *language
571 ) {
572 return (StatePredecessorMap) {
573 .contents = ts_calloc(
574 language->state_count * (MAX_STATE_PREDECESSOR_COUNT + 1),
575 sizeof(TSStateId)
576 ),
577 };
578 }
579
state_predecessor_map_delete(StatePredecessorMap * self)580 static inline void state_predecessor_map_delete(StatePredecessorMap *self) {
581 ts_free(self->contents);
582 }
583
state_predecessor_map_add(StatePredecessorMap * self,TSStateId state,TSStateId predecessor)584 static inline void state_predecessor_map_add(
585 StatePredecessorMap *self,
586 TSStateId state,
587 TSStateId predecessor
588 ) {
589 unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1);
590 TSStateId *count = &self->contents[index];
591 if (
592 *count == 0 ||
593 (*count < MAX_STATE_PREDECESSOR_COUNT && self->contents[index + *count] != predecessor)
594 ) {
595 (*count)++;
596 self->contents[index + *count] = predecessor;
597 }
598 }
599
state_predecessor_map_get(const StatePredecessorMap * self,TSStateId state,unsigned * count)600 static inline const TSStateId *state_predecessor_map_get(
601 const StatePredecessorMap *self,
602 TSStateId state,
603 unsigned *count
604 ) {
605 unsigned index = state * (MAX_STATE_PREDECESSOR_COUNT + 1);
606 *count = self->contents[index];
607 return &self->contents[index + 1];
608 }
609
610 /****************
611 * AnalysisState
612 ****************/
613
analysis_state__recursion_depth(const AnalysisState * self)614 static unsigned analysis_state__recursion_depth(const AnalysisState *self) {
615 unsigned result = 0;
616 for (unsigned i = 0; i < self->depth; i++) {
617 TSSymbol symbol = self->stack[i].parent_symbol;
618 for (unsigned j = 0; j < i; j++) {
619 if (self->stack[j].parent_symbol == symbol) {
620 result++;
621 break;
622 }
623 }
624 }
625 return result;
626 }
627
analysis_state__compare_position(const AnalysisState * self,const AnalysisState * other)628 static inline int analysis_state__compare_position(
629 const AnalysisState *self,
630 const AnalysisState *other
631 ) {
632 for (unsigned i = 0; i < self->depth; i++) {
633 if (i >= other->depth) return -1;
634 if (self->stack[i].child_index < other->stack[i].child_index) return -1;
635 if (self->stack[i].child_index > other->stack[i].child_index) return 1;
636 }
637 if (self->depth < other->depth) return 1;
638 return 0;
639 }
640
analysis_state__compare(const AnalysisState * self,const AnalysisState * other)641 static inline int analysis_state__compare(
642 const AnalysisState *self,
643 const AnalysisState *other
644 ) {
645 int result = analysis_state__compare_position(self, other);
646 if (result != 0) return result;
647 for (unsigned i = 0; i < self->depth; i++) {
648 if (self->stack[i].parent_symbol < other->stack[i].parent_symbol) return -1;
649 if (self->stack[i].parent_symbol > other->stack[i].parent_symbol) return 1;
650 if (self->stack[i].parse_state < other->stack[i].parse_state) return -1;
651 if (self->stack[i].parse_state > other->stack[i].parse_state) return 1;
652 if (self->stack[i].field_id < other->stack[i].field_id) return -1;
653 if (self->stack[i].field_id > other->stack[i].field_id) return 1;
654 }
655 if (self->step_index < other->step_index) return -1;
656 if (self->step_index > other->step_index) return 1;
657 return 0;
658 }
659
analysis_state__top(AnalysisState * self)660 static inline AnalysisStateEntry *analysis_state__top(AnalysisState *self) {
661 return &self->stack[self->depth - 1];
662 }
663
analysis_state__has_supertype(AnalysisState * self,TSSymbol symbol)664 static inline bool analysis_state__has_supertype(AnalysisState *self, TSSymbol symbol) {
665 for (unsigned i = 0; i < self->depth; i++) {
666 if (self->stack[i].parent_symbol == symbol) return true;
667 }
668 return false;
669 }
670
671 /***********************
672 * AnalysisSubgraphNode
673 ***********************/
674
analysis_subgraph_node__compare(const AnalysisSubgraphNode * self,const AnalysisSubgraphNode * other)675 static inline int analysis_subgraph_node__compare(const AnalysisSubgraphNode *self, const AnalysisSubgraphNode *other) {
676 if (self->state < other->state) return -1;
677 if (self->state > other->state) return 1;
678 if (self->child_index < other->child_index) return -1;
679 if (self->child_index > other->child_index) return 1;
680 if (self->done < other->done) return -1;
681 if (self->done > other->done) return 1;
682 if (self->production_id < other->production_id) return -1;
683 if (self->production_id > other->production_id) return 1;
684 return 0;
685 }
686
687 /*********
688 * Query
689 *********/
690
691 // The `pattern_map` contains a mapping from TSSymbol values to indices in the
692 // `steps` array. For a given syntax node, the `pattern_map` makes it possible
693 // to quickly find the starting steps of all of the patterns whose root matches
694 // that node. Each entry has two fields: a `pattern_index`, which identifies one
695 // of the patterns in the query, and a `step_index`, which indicates the start
696 // offset of that pattern's steps within the `steps` array.
697 //
698 // The entries are sorted by the patterns' root symbols, and lookups use a
699 // binary search. This ensures that the cost of this initial lookup step
700 // scales logarithmically with the number of patterns in the query.
701 //
702 // This returns `true` if the symbol is present and `false` otherwise.
703 // If the symbol is not present `*result` is set to the index where the
704 // symbol should be inserted.
ts_query__pattern_map_search(const TSQuery * self,TSSymbol needle,uint32_t * result)705 static inline bool ts_query__pattern_map_search(
706 const TSQuery *self,
707 TSSymbol needle,
708 uint32_t *result
709 ) {
710 uint32_t base_index = self->wildcard_root_pattern_count;
711 uint32_t size = self->pattern_map.size - base_index;
712 if (size == 0) {
713 *result = base_index;
714 return false;
715 }
716 while (size > 1) {
717 uint32_t half_size = size / 2;
718 uint32_t mid_index = base_index + half_size;
719 TSSymbol mid_symbol = self->steps.contents[
720 self->pattern_map.contents[mid_index].step_index
721 ].symbol;
722 if (needle > mid_symbol) base_index = mid_index;
723 size -= half_size;
724 }
725
726 TSSymbol symbol = self->steps.contents[
727 self->pattern_map.contents[base_index].step_index
728 ].symbol;
729
730 if (needle > symbol) {
731 base_index++;
732 if (base_index < self->pattern_map.size) {
733 symbol = self->steps.contents[
734 self->pattern_map.contents[base_index].step_index
735 ].symbol;
736 }
737 }
738
739 *result = base_index;
740 return needle == symbol;
741 }
742
743 // Insert a new pattern's start index into the pattern map, maintaining
744 // the pattern map's ordering invariant.
ts_query__pattern_map_insert(TSQuery * self,TSSymbol symbol,PatternEntry new_entry)745 static inline void ts_query__pattern_map_insert(
746 TSQuery *self,
747 TSSymbol symbol,
748 PatternEntry new_entry
749 ) {
750 uint32_t index;
751 ts_query__pattern_map_search(self, symbol, &index);
752
753 // Ensure that the entries are sorted not only by symbol, but also
754 // by pattern_index. This way, states for earlier patterns will be
755 // initiated first, which allows the ordering of the states array
756 // to be maintained more efficiently.
757 while (index < self->pattern_map.size) {
758 PatternEntry *entry = &self->pattern_map.contents[index];
759 if (
760 self->steps.contents[entry->step_index].symbol == symbol &&
761 entry->pattern_index < new_entry.pattern_index
762 ) {
763 index++;
764 } else {
765 break;
766 }
767 }
768
769 array_insert(&self->pattern_map, index, new_entry);
770 }
771
ts_query__analyze_patterns(TSQuery * self,unsigned * error_offset)772 static bool ts_query__analyze_patterns(TSQuery *self, unsigned *error_offset) {
773 // Walk forward through all of the steps in the query, computing some
774 // basic information about each step. Mark all of the steps that contain
775 // captures, and record the indices of all of the steps that have child steps.
776 Array(uint32_t) parent_step_indices = array_new();
777 for (unsigned i = 0; i < self->steps.size; i++) {
778 QueryStep *step = &self->steps.contents[i];
779 if (step->depth == PATTERN_DONE_MARKER) {
780 step->parent_pattern_guaranteed = true;
781 step->root_pattern_guaranteed = true;
782 continue;
783 }
784
785 bool has_children = false;
786 bool is_wildcard = step->symbol == WILDCARD_SYMBOL;
787 step->contains_captures = step->capture_ids[0] != NONE;
788 for (unsigned j = i + 1; j < self->steps.size; j++) {
789 QueryStep *next_step = &self->steps.contents[j];
790 if (
791 next_step->depth == PATTERN_DONE_MARKER ||
792 next_step->depth <= step->depth
793 ) break;
794 if (next_step->capture_ids[0] != NONE) {
795 step->contains_captures = true;
796 }
797 if (!is_wildcard) {
798 next_step->root_pattern_guaranteed = true;
799 next_step->parent_pattern_guaranteed = true;
800 }
801 has_children = true;
802 }
803
804 if (has_children && !is_wildcard) {
805 array_push(&parent_step_indices, i);
806 }
807 }
808
809 // For every parent symbol in the query, initialize an 'analysis subgraph'.
810 // This subgraph lists all of the states in the parse table that are directly
811 // involved in building subtrees for this symbol.
812 //
813 // In addition to the parent symbols in the query, construct subgraphs for all
814 // of the hidden symbols in the grammar, because these might occur within
815 // one of the parent nodes, such that their children appear to belong to the
816 // parent.
817 Array(AnalysisSubgraph) subgraphs = array_new();
818 for (unsigned i = 0; i < parent_step_indices.size; i++) {
819 uint32_t parent_step_index = parent_step_indices.contents[i];
820 TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol;
821 AnalysisSubgraph subgraph = { .symbol = parent_symbol };
822 array_insert_sorted_by(&subgraphs, .symbol, subgraph);
823 }
824 for (TSSymbol sym = self->language->token_count; sym < self->language->symbol_count; sym++) {
825 if (!ts_language_symbol_metadata(self->language, sym).visible) {
826 AnalysisSubgraph subgraph = { .symbol = sym };
827 array_insert_sorted_by(&subgraphs, .symbol, subgraph);
828 }
829 }
830
831 // Scan the parse table to find the data needed to populate these subgraphs.
832 // Collect three things during this scan:
833 // 1) All of the parse states where one of these symbols can start.
834 // 2) All of the parse states where one of these symbols can end, along
835 // with information about the node that would be created.
836 // 3) A list of predecessor states for each state.
837 StatePredecessorMap predecessor_map = state_predecessor_map_new(self->language);
838 for (TSStateId state = 1; state < self->language->state_count; state++) {
839 unsigned subgraph_index, exists;
840 LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, state);
841 while (ts_lookahead_iterator_next(&lookahead_iterator)) {
842 if (lookahead_iterator.action_count) {
843 for (unsigned i = 0; i < lookahead_iterator.action_count; i++) {
844 const TSParseAction *action = &lookahead_iterator.actions[i];
845 if (action->type == TSParseActionTypeReduce) {
846 const TSSymbol *aliases, *aliases_end;
847 ts_language_aliases_for_symbol(
848 self->language,
849 action->reduce.symbol,
850 &aliases,
851 &aliases_end
852 );
853 for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
854 array_search_sorted_by(
855 &subgraphs,
856 .symbol,
857 *symbol,
858 &subgraph_index,
859 &exists
860 );
861 if (exists) {
862 AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
863 if (subgraph->nodes.size == 0 || array_back(&subgraph->nodes)->state != state) {
864 array_push(&subgraph->nodes, ((AnalysisSubgraphNode) {
865 .state = state,
866 .production_id = action->reduce.production_id,
867 .child_index = action->reduce.child_count,
868 .done = true,
869 }));
870 }
871 }
872 }
873 } else if (action->type == TSParseActionTypeShift && !action->shift.extra) {
874 TSStateId next_state = action->shift.state;
875 state_predecessor_map_add(&predecessor_map, next_state, state);
876 }
877 }
878 } else if (lookahead_iterator.next_state != 0) {
879 if (lookahead_iterator.next_state != state) {
880 state_predecessor_map_add(&predecessor_map, lookahead_iterator.next_state, state);
881 }
882 const TSSymbol *aliases, *aliases_end;
883 ts_language_aliases_for_symbol(
884 self->language,
885 lookahead_iterator.symbol,
886 &aliases,
887 &aliases_end
888 );
889 for (const TSSymbol *symbol = aliases; symbol < aliases_end; symbol++) {
890 array_search_sorted_by(
891 &subgraphs,
892 .symbol,
893 *symbol,
894 &subgraph_index,
895 &exists
896 );
897 if (exists) {
898 AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
899 if (
900 subgraph->start_states.size == 0 ||
901 *array_back(&subgraph->start_states) != state
902 )
903 array_push(&subgraph->start_states, state);
904 }
905 }
906 }
907 }
908 }
909
910 // For each subgraph, compute the preceding states by walking backward
911 // from the end states using the predecessor map.
912 Array(AnalysisSubgraphNode) next_nodes = array_new();
913 for (unsigned i = 0; i < subgraphs.size; i++) {
914 AnalysisSubgraph *subgraph = &subgraphs.contents[i];
915 if (subgraph->nodes.size == 0) {
916 array_delete(&subgraph->start_states);
917 array_erase(&subgraphs, i);
918 i--;
919 continue;
920 }
921 array_assign(&next_nodes, &subgraph->nodes);
922 while (next_nodes.size > 0) {
923 AnalysisSubgraphNode node = array_pop(&next_nodes);
924 if (node.child_index > 1) {
925 unsigned predecessor_count;
926 const TSStateId *predecessors = state_predecessor_map_get(
927 &predecessor_map,
928 node.state,
929 &predecessor_count
930 );
931 for (unsigned j = 0; j < predecessor_count; j++) {
932 AnalysisSubgraphNode predecessor_node = {
933 .state = predecessors[j],
934 .child_index = node.child_index - 1,
935 .production_id = node.production_id,
936 .done = false,
937 };
938 unsigned index, exists;
939 array_search_sorted_with(
940 &subgraph->nodes, analysis_subgraph_node__compare, &predecessor_node,
941 &index, &exists
942 );
943 if (!exists) {
944 array_insert(&subgraph->nodes, index, predecessor_node);
945 array_push(&next_nodes, predecessor_node);
946 }
947 }
948 }
949 }
950 }
951
952 #ifdef DEBUG_ANALYZE_QUERY
953 printf("\nSubgraphs:\n");
954 for (unsigned i = 0; i < subgraphs.size; i++) {
955 AnalysisSubgraph *subgraph = &subgraphs.contents[i];
956 printf(" %u, %s:\n", subgraph->symbol, ts_language_symbol_name(self->language, subgraph->symbol));
957 for (unsigned j = 0; j < subgraph->start_states.size; j++) {
958 printf(
959 " {state: %u}\n",
960 subgraph->start_states.contents[j]
961 );
962 }
963 for (unsigned j = 0; j < subgraph->nodes.size; j++) {
964 AnalysisSubgraphNode *node = &subgraph->nodes.contents[j];
965 printf(
966 " {state: %u, child_index: %u, production_id: %u, done: %d}\n",
967 node->state, node->child_index, node->production_id, node->done
968 );
969 }
970 printf("\n");
971 }
972 #endif
973
974 // For each non-terminal pattern, determine if the pattern can successfully match,
975 // and identify all of the possible children within the pattern where matching could fail.
976 bool all_patterns_are_valid = true;
977 AnalysisStateSet states = array_new();
978 AnalysisStateSet next_states = array_new();
979 AnalysisStateSet deeper_states = array_new();
980 Array(uint16_t) final_step_indices = array_new();
981 for (unsigned i = 0; i < parent_step_indices.size; i++) {
982 uint16_t parent_step_index = parent_step_indices.contents[i];
983 uint16_t parent_depth = self->steps.contents[parent_step_index].depth;
984 TSSymbol parent_symbol = self->steps.contents[parent_step_index].symbol;
985 if (parent_symbol == ts_builtin_sym_error) continue;
986
987 // Find the subgraph that corresponds to this pattern's root symbol. If the pattern's
988 // root symbol is a terminal, then return an error.
989 unsigned subgraph_index, exists;
990 array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists);
991 if (!exists) {
992 unsigned first_child_step_index = parent_step_index + 1;
993 uint32_t i, exists;
994 array_search_sorted_by(&self->step_offsets, .step_index, first_child_step_index, &i, &exists);
995 assert(exists);
996 *error_offset = self->step_offsets.contents[i].byte_offset;
997 all_patterns_are_valid = false;
998 break;
999 }
1000
1001 // Initialize an analysis state at every parse state in the table where
1002 // this parent symbol can occur.
1003 AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
1004 array_clear(&states);
1005 array_clear(&deeper_states);
1006 for (unsigned j = 0; j < subgraph->start_states.size; j++) {
1007 TSStateId parse_state = subgraph->start_states.contents[j];
1008 array_push(&states, ((AnalysisState) {
1009 .step_index = parent_step_index + 1,
1010 .stack = {
1011 [0] = {
1012 .parse_state = parse_state,
1013 .parent_symbol = parent_symbol,
1014 .child_index = 0,
1015 .field_id = 0,
1016 .done = false,
1017 },
1018 },
1019 .depth = 1,
1020 }));
1021 }
1022
1023 // Walk the subgraph for this non-terminal, tracking all of the possible
1024 // sequences of progress within the pattern.
1025 bool can_finish_pattern = false;
1026 bool did_exceed_max_depth = false;
1027 unsigned recursion_depth_limit = 0;
1028 unsigned prev_final_step_count = 0;
1029 array_clear(&final_step_indices);
1030 for (;;) {
1031 #ifdef DEBUG_ANALYZE_QUERY
1032 printf("Final step indices:");
1033 for (unsigned j = 0; j < final_step_indices.size; j++) {
1034 printf(" %4u", final_step_indices.contents[j]);
1035 }
1036 printf("\nWalk states for %u %s:\n", i, ts_language_symbol_name(self->language, parent_symbol));
1037 for (unsigned j = 0; j < states.size; j++) {
1038 AnalysisState *state = &states.contents[j];
1039 printf(" %3u: step: %u, stack: [", j, state->step_index);
1040 for (unsigned k = 0; k < state->depth; k++) {
1041 printf(
1042 " {%s, child: %u, state: %4u",
1043 self->language->symbol_names[state->stack[k].parent_symbol],
1044 state->stack[k].child_index,
1045 state->stack[k].parse_state
1046 );
1047 if (state->stack[k].field_id) printf(", field: %s", self->language->field_names[state->stack[k].field_id]);
1048 if (state->stack[k].done) printf(", DONE");
1049 printf("}");
1050 }
1051 printf(" ]\n");
1052 }
1053 #endif
1054
1055 // If no further progress can be made within the current recursion depth limit, then
1056 // bump the depth limit by one, and continue to process the states the exceeded the
1057 // limit. But only allow this if progress has been made since the last time the depth
1058 // limit was increased.
1059 if (states.size == 0) {
1060 if (deeper_states.size > 0 && final_step_indices.size > prev_final_step_count) {
1061 #ifdef DEBUG_ANALYZE_QUERY
1062 printf("Increase recursion depth limit to %u\n", recursion_depth_limit + 1);
1063 #endif
1064
1065 prev_final_step_count = final_step_indices.size;
1066 recursion_depth_limit++;
1067 AnalysisStateSet _states = states;
1068 states = deeper_states;
1069 deeper_states = _states;
1070 continue;
1071 }
1072
1073 break;
1074 }
1075
1076 array_clear(&next_states);
1077 for (unsigned j = 0; j < states.size; j++) {
1078 AnalysisState * const state = &states.contents[j];
1079
1080 // For efficiency, it's important to avoid processing the same analysis state more
1081 // than once. To achieve this, keep the states in order of ascending position within
1082 // their hypothetical syntax trees. In each iteration of this loop, start by advancing
1083 // the states that have made the least progress. Avoid advancing states that have already
1084 // made more progress.
1085 if (next_states.size > 0) {
1086 int comparison = analysis_state__compare_position(state, array_back(&next_states));
1087 if (comparison == 0) {
1088 array_insert_sorted_with(&next_states, analysis_state__compare, *state);
1089 continue;
1090 } else if (comparison > 0) {
1091 while (j < states.size) {
1092 array_push(&next_states, states.contents[j]);
1093 j++;
1094 }
1095 break;
1096 }
1097 }
1098
1099 const TSStateId parse_state = analysis_state__top(state)->parse_state;
1100 const TSSymbol parent_symbol = analysis_state__top(state)->parent_symbol;
1101 const TSFieldId parent_field_id = analysis_state__top(state)->field_id;
1102 const unsigned child_index = analysis_state__top(state)->child_index;
1103 const QueryStep * const step = &self->steps.contents[state->step_index];
1104
1105 unsigned subgraph_index, exists;
1106 array_search_sorted_by(&subgraphs, .symbol, parent_symbol, &subgraph_index, &exists);
1107 if (!exists) continue;
1108 const AnalysisSubgraph *subgraph = &subgraphs.contents[subgraph_index];
1109
1110 // Follow every possible path in the parse table, but only visit states that
1111 // are part of the subgraph for the current symbol.
1112 LookaheadIterator lookahead_iterator = ts_language_lookaheads(self->language, parse_state);
1113 while (ts_lookahead_iterator_next(&lookahead_iterator)) {
1114 TSSymbol sym = lookahead_iterator.symbol;
1115
1116 AnalysisSubgraphNode successor = {
1117 .state = parse_state,
1118 .child_index = child_index,
1119 };
1120 if (lookahead_iterator.action_count) {
1121 const TSParseAction *action = &lookahead_iterator.actions[lookahead_iterator.action_count - 1];
1122 if (action->type == TSParseActionTypeShift) {
1123 if (!action->shift.extra) {
1124 successor.state = action->shift.state;
1125 successor.child_index++;
1126 }
1127 } else {
1128 continue;
1129 }
1130 } else if (lookahead_iterator.next_state != 0) {
1131 successor.state = lookahead_iterator.next_state;
1132 successor.child_index++;
1133 } else {
1134 continue;
1135 }
1136
1137 unsigned node_index;
1138 array_search_sorted_with(
1139 &subgraph->nodes,
1140 analysis_subgraph_node__compare, &successor,
1141 &node_index, &exists
1142 );
1143 while (node_index < subgraph->nodes.size) {
1144 AnalysisSubgraphNode *node = &subgraph->nodes.contents[node_index++];
1145 if (node->state != successor.state || node->child_index != successor.child_index) break;
1146
1147 // Use the subgraph to determine what alias and field will eventually be applied
1148 // to this child node.
1149 TSSymbol alias = ts_language_alias_at(self->language, node->production_id, child_index);
1150 TSSymbol visible_symbol = alias
1151 ? alias
1152 : self->language->symbol_metadata[sym].visible
1153 ? self->language->public_symbol_map[sym]
1154 : 0;
1155 TSFieldId field_id = parent_field_id;
1156 if (!field_id) {
1157 const TSFieldMapEntry *field_map, *field_map_end;
1158 ts_language_field_map(self->language, node->production_id, &field_map, &field_map_end);
1159 for (; field_map != field_map_end; field_map++) {
1160 if (!field_map->inherited && field_map->child_index == child_index) {
1161 field_id = field_map->field_id;
1162 break;
1163 }
1164 }
1165 }
1166
1167 // Create a new state that has advanced past this hypothetical subtree.
1168 AnalysisState next_state = *state;
1169 AnalysisStateEntry *next_state_top = analysis_state__top(&next_state);
1170 next_state_top->child_index = successor.child_index;
1171 next_state_top->parse_state = successor.state;
1172 if (node->done) next_state_top->done = true;
1173
1174 // Determine if this hypothetical child node would match the current step
1175 // of the query pattern.
1176 bool does_match = false;
1177 if (visible_symbol) {
1178 does_match = true;
1179 if (step->symbol == WILDCARD_SYMBOL) {
1180 if (
1181 step->is_named &&
1182 !self->language->symbol_metadata[visible_symbol].named
1183 ) does_match = false;
1184 } else if (step->symbol != visible_symbol) {
1185 does_match = false;
1186 }
1187 if (step->field && step->field != field_id) {
1188 does_match = false;
1189 }
1190 if (
1191 step->supertype_symbol &&
1192 !analysis_state__has_supertype(state, step->supertype_symbol)
1193 ) does_match = false;
1194 }
1195
1196 // If this child is hidden, then descend into it and walk through its children.
1197 // If the top entry of the stack is at the end of its rule, then that entry can
1198 // be replaced. Otherwise, push a new entry onto the stack.
1199 else if (sym >= self->language->token_count) {
1200 if (!next_state_top->done) {
1201 if (next_state.depth + 1 >= MAX_ANALYSIS_STATE_DEPTH) {
1202 #ifdef DEBUG_ANALYZE_QUERY
1203 printf("Exceeded depth limit for state %u\n", j);
1204 #endif
1205
1206 did_exceed_max_depth = true;
1207 continue;
1208 }
1209
1210 next_state.depth++;
1211 next_state_top = analysis_state__top(&next_state);
1212 }
1213
1214 *next_state_top = (AnalysisStateEntry) {
1215 .parse_state = parse_state,
1216 .parent_symbol = sym,
1217 .child_index = 0,
1218 .field_id = field_id,
1219 .done = false,
1220 };
1221
1222 if (analysis_state__recursion_depth(&next_state) > recursion_depth_limit) {
1223 array_insert_sorted_with(&deeper_states, analysis_state__compare, next_state);
1224 continue;
1225 }
1226 }
1227
1228 // Pop from the stack when this state reached the end of its current syntax node.
1229 while (next_state.depth > 0 && next_state_top->done) {
1230 next_state.depth--;
1231 next_state_top = analysis_state__top(&next_state);
1232 }
1233
1234 // If this hypothetical child did match the current step of the query pattern,
1235 // then advance to the next step at the current depth. This involves skipping
1236 // over any descendant steps of the current child.
1237 const QueryStep *next_step = step;
1238 if (does_match) {
1239 for (;;) {
1240 next_state.step_index++;
1241 next_step = &self->steps.contents[next_state.step_index];
1242 if (
1243 next_step->depth == PATTERN_DONE_MARKER ||
1244 next_step->depth <= parent_depth + 1
1245 ) break;
1246 }
1247 } else if (successor.state == parse_state) {
1248 continue;
1249 }
1250
1251 for (;;) {
1252 // Skip pass-through states. Although these states have alternatives, they are only
1253 // used to implement repetitions, and query analysis does not need to process
1254 // repetitions in order to determine whether steps are possible and definite.
1255 if (next_step->is_pass_through) {
1256 next_state.step_index++;
1257 next_step++;
1258 continue;
1259 }
1260
1261 // If the pattern is finished or hypothetical parent node is complete, then
1262 // record that matching can terminate at this step of the pattern. Otherwise,
1263 // add this state to the list of states to process on the next iteration.
1264 if (!next_step->is_dead_end) {
1265 bool did_finish_pattern = self->steps.contents[next_state.step_index].depth != parent_depth + 1;
1266 if (did_finish_pattern) can_finish_pattern = true;
1267 if (did_finish_pattern || next_state.depth == 0) {
1268 array_insert_sorted_by(&final_step_indices, , next_state.step_index);
1269 } else {
1270 array_insert_sorted_with(&next_states, analysis_state__compare, next_state);
1271 }
1272 }
1273
1274 // If the state has advanced to a step with an alternative step, then add another state
1275 // at that alternative step. This process is simpler than the process of actually matching a
1276 // pattern during query exection, because for the purposes of query analysis, there is no
1277 // need to process repetitions.
1278 if (
1279 does_match &&
1280 next_step->alternative_index != NONE &&
1281 next_step->alternative_index > next_state.step_index
1282 ) {
1283 next_state.step_index = next_step->alternative_index;
1284 next_step = &self->steps.contents[next_state.step_index];
1285 } else {
1286 break;
1287 }
1288 }
1289 }
1290 }
1291 }
1292
1293 AnalysisStateSet _states = states;
1294 states = next_states;
1295 next_states = _states;
1296 }
1297
1298 // Mark as indefinite any step where a match terminated.
1299 // Later, this property will be propagated to all of the step's predecessors.
1300 for (unsigned j = 0; j < final_step_indices.size; j++) {
1301 uint32_t final_step_index = final_step_indices.contents[j];
1302 QueryStep *step = &self->steps.contents[final_step_index];
1303 if (
1304 step->depth != PATTERN_DONE_MARKER &&
1305 step->depth > parent_depth &&
1306 !step->is_dead_end
1307 ) {
1308 step->parent_pattern_guaranteed = false;
1309 step->root_pattern_guaranteed = false;
1310 }
1311 }
1312
1313 if (did_exceed_max_depth) {
1314 for (unsigned j = parent_step_index + 1; j < self->steps.size; j++) {
1315 QueryStep *step = &self->steps.contents[j];
1316 if (
1317 step->depth <= parent_depth ||
1318 step->depth == PATTERN_DONE_MARKER
1319 ) break;
1320 if (!step->is_dead_end) {
1321 step->parent_pattern_guaranteed = false;
1322 step->root_pattern_guaranteed = false;
1323 }
1324 }
1325 }
1326
1327 // If this pattern cannot match, store the pattern index so that it can be
1328 // returned to the caller.
1329 if (all_patterns_are_valid && !can_finish_pattern && !did_exceed_max_depth) {
1330 assert(final_step_indices.size > 0);
1331 uint16_t impossible_step_index = *array_back(&final_step_indices);
1332 uint32_t i, exists;
1333 array_search_sorted_by(&self->step_offsets, .step_index, impossible_step_index, &i, &exists);
1334 if (i >= self->step_offsets.size) i = self->step_offsets.size - 1;
1335 *error_offset = self->step_offsets.contents[i].byte_offset;
1336 all_patterns_are_valid = false;
1337 break;
1338 }
1339 }
1340
1341 // Mark as indefinite any step with captures that are used in predicates.
1342 Array(uint16_t) predicate_capture_ids = array_new();
1343 for (unsigned i = 0; i < self->patterns.size; i++) {
1344 QueryPattern *pattern = &self->patterns.contents[i];
1345
1346 // Gather all of the captures that are used in predicates for this pattern.
1347 array_clear(&predicate_capture_ids);
1348 for (
1349 unsigned start = pattern->predicate_steps.offset,
1350 end = start + pattern->predicate_steps.length,
1351 j = start; j < end; j++
1352 ) {
1353 TSQueryPredicateStep *step = &self->predicate_steps.contents[j];
1354 if (step->type == TSQueryPredicateStepTypeCapture) {
1355 array_insert_sorted_by(&predicate_capture_ids, , step->value_id);
1356 }
1357 }
1358
1359 // Find all of the steps that have these captures.
1360 for (
1361 unsigned start = pattern->steps.offset,
1362 end = start + pattern->steps.length,
1363 j = start; j < end; j++
1364 ) {
1365 QueryStep *step = &self->steps.contents[j];
1366 for (unsigned k = 0; k < MAX_STEP_CAPTURE_COUNT; k++) {
1367 uint16_t capture_id = step->capture_ids[k];
1368 if (capture_id == NONE) break;
1369 unsigned index, exists;
1370 array_search_sorted_by(&predicate_capture_ids, , capture_id, &index, &exists);
1371 if (exists) {
1372 step->root_pattern_guaranteed = false;
1373 break;
1374 }
1375 }
1376 }
1377 }
1378
1379 // Propagate fallibility. If a pattern is fallible at a given step, then it is
1380 // fallible at all of its preceding steps.
1381 bool done = self->steps.size == 0;
1382 while (!done) {
1383 done = true;
1384 for (unsigned i = self->steps.size - 1; i > 0; i--) {
1385 QueryStep *step = &self->steps.contents[i];
1386 if (step->depth == PATTERN_DONE_MARKER) continue;
1387
1388 // Determine if this step is definite or has definite alternatives.
1389 bool parent_pattern_guaranteed = false;
1390 for (;;) {
1391 if (step->root_pattern_guaranteed) {
1392 parent_pattern_guaranteed = true;
1393 break;
1394 }
1395 if (step->alternative_index == NONE || step->alternative_index < i) {
1396 break;
1397 }
1398 step = &self->steps.contents[step->alternative_index];
1399 }
1400
1401 // If not, mark its predecessor as indefinite.
1402 if (!parent_pattern_guaranteed) {
1403 QueryStep *prev_step = &self->steps.contents[i - 1];
1404 if (
1405 !prev_step->is_dead_end &&
1406 prev_step->depth != PATTERN_DONE_MARKER &&
1407 prev_step->root_pattern_guaranteed
1408 ) {
1409 prev_step->root_pattern_guaranteed = false;
1410 done = false;
1411 }
1412 }
1413 }
1414 }
1415
1416 #ifdef DEBUG_ANALYZE_QUERY
1417 printf("Steps:\n");
1418 for (unsigned i = 0; i < self->steps.size; i++) {
1419 QueryStep *step = &self->steps.contents[i];
1420 if (step->depth == PATTERN_DONE_MARKER) {
1421 printf(" %u: DONE\n", i);
1422 } else {
1423 printf(
1424 " %u: {symbol: %s, field: %s, depth: %u, parent_pattern_guaranteed: %d, root_pattern_guaranteed: %d}\n",
1425 i,
1426 (step->symbol == WILDCARD_SYMBOL)
1427 ? "ANY"
1428 : ts_language_symbol_name(self->language, step->symbol),
1429 (step->field ? ts_language_field_name_for_id(self->language, step->field) : "-"),
1430 step->depth,
1431 step->parent_pattern_guaranteed,
1432 step->root_pattern_guaranteed
1433 );
1434 }
1435 }
1436 #endif
1437
1438 // Cleanup
1439 for (unsigned i = 0; i < subgraphs.size; i++) {
1440 array_delete(&subgraphs.contents[i].start_states);
1441 array_delete(&subgraphs.contents[i].nodes);
1442 }
1443 array_delete(&subgraphs);
1444 array_delete(&next_nodes);
1445 array_delete(&states);
1446 array_delete(&next_states);
1447 array_delete(&deeper_states);
1448 array_delete(&final_step_indices);
1449 array_delete(&parent_step_indices);
1450 array_delete(&predicate_capture_ids);
1451 state_predecessor_map_delete(&predecessor_map);
1452
1453 return all_patterns_are_valid;
1454 }
1455
ts_query__add_negated_fields(TSQuery * self,uint16_t step_index,TSFieldId * field_ids,uint16_t field_count)1456 static void ts_query__add_negated_fields(
1457 TSQuery *self,
1458 uint16_t step_index,
1459 TSFieldId *field_ids,
1460 uint16_t field_count
1461 ) {
1462 QueryStep *step = &self->steps.contents[step_index];
1463
1464 // The negated field array stores a list of field lists, separated by zeros.
1465 // Try to find the start index of an existing list that matches this new list.
1466 bool failed_match = false;
1467 unsigned match_count = 0;
1468 unsigned start_i = 0;
1469 for (unsigned i = 0; i < self->negated_fields.size; i++) {
1470 TSFieldId existing_field_id = self->negated_fields.contents[i];
1471
1472 // At each zero value, terminate the match attempt. If we've exactly
1473 // matched the new field list, then reuse this index. Otherwise,
1474 // start over the matching process.
1475 if (existing_field_id == 0) {
1476 if (match_count == field_count) {
1477 step->negated_field_list_id = start_i;
1478 return;
1479 } else {
1480 start_i = i + 1;
1481 match_count = 0;
1482 failed_match = false;
1483 }
1484 }
1485
1486 // If the existing list matches our new list so far, then advance
1487 // to the next element of the new list.
1488 else if (
1489 match_count < field_count &&
1490 existing_field_id == field_ids[match_count] &&
1491 !failed_match
1492 ) {
1493 match_count++;
1494 }
1495
1496 // Otherwise, this existing list has failed to match.
1497 else {
1498 match_count = 0;
1499 failed_match = true;
1500 }
1501 }
1502
1503 step->negated_field_list_id = self->negated_fields.size;
1504 array_extend(&self->negated_fields, field_count, field_ids);
1505 array_push(&self->negated_fields, 0);
1506 }
1507
ts_query__parse_string_literal(TSQuery * self,Stream * stream)1508 static TSQueryError ts_query__parse_string_literal(
1509 TSQuery *self,
1510 Stream *stream
1511 ) {
1512 const char *string_start = stream->input;
1513 if (stream->next != '"') return TSQueryErrorSyntax;
1514 stream_advance(stream);
1515 const char *prev_position = stream->input;
1516
1517 bool is_escaped = false;
1518 array_clear(&self->string_buffer);
1519 for (;;) {
1520 if (is_escaped) {
1521 is_escaped = false;
1522 switch (stream->next) {
1523 case 'n':
1524 array_push(&self->string_buffer, '\n');
1525 break;
1526 case 'r':
1527 array_push(&self->string_buffer, '\r');
1528 break;
1529 case 't':
1530 array_push(&self->string_buffer, '\t');
1531 break;
1532 case '0':
1533 array_push(&self->string_buffer, '\0');
1534 break;
1535 default:
1536 array_extend(&self->string_buffer, stream->next_size, stream->input);
1537 break;
1538 }
1539 prev_position = stream->input + stream->next_size;
1540 } else {
1541 if (stream->next == '\\') {
1542 array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
1543 prev_position = stream->input + 1;
1544 is_escaped = true;
1545 } else if (stream->next == '"') {
1546 array_extend(&self->string_buffer, (stream->input - prev_position), prev_position);
1547 stream_advance(stream);
1548 return TSQueryErrorNone;
1549 } else if (stream->next == '\n') {
1550 stream_reset(stream, string_start);
1551 return TSQueryErrorSyntax;
1552 }
1553 }
1554 if (!stream_advance(stream)) {
1555 stream_reset(stream, string_start);
1556 return TSQueryErrorSyntax;
1557 }
1558 }
1559 }
1560
1561 // Parse a single predicate associated with a pattern, adding it to the
1562 // query's internal `predicate_steps` array. Predicates are arbitrary
1563 // S-expressions associated with a pattern which are meant to be handled at
1564 // a higher level of abstraction, such as the Rust/JavaScript bindings. They
1565 // can contain '@'-prefixed capture names, double-quoted strings, and bare
1566 // symbols, which also represent strings.
ts_query__parse_predicate(TSQuery * self,Stream * stream)1567 static TSQueryError ts_query__parse_predicate(
1568 TSQuery *self,
1569 Stream *stream
1570 ) {
1571 if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1572 const char *predicate_name = stream->input;
1573 stream_scan_identifier(stream);
1574 uint32_t length = stream->input - predicate_name;
1575 uint16_t id = symbol_table_insert_name(
1576 &self->predicate_values,
1577 predicate_name,
1578 length
1579 );
1580 array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1581 .type = TSQueryPredicateStepTypeString,
1582 .value_id = id,
1583 }));
1584 stream_skip_whitespace(stream);
1585
1586 for (;;) {
1587 if (stream->next == ')') {
1588 stream_advance(stream);
1589 stream_skip_whitespace(stream);
1590 array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1591 .type = TSQueryPredicateStepTypeDone,
1592 .value_id = 0,
1593 }));
1594 break;
1595 }
1596
1597 // Parse an '@'-prefixed capture name
1598 else if (stream->next == '@') {
1599 stream_advance(stream);
1600
1601 // Parse the capture name
1602 if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1603 const char *capture_name = stream->input;
1604 stream_scan_identifier(stream);
1605 uint32_t length = stream->input - capture_name;
1606
1607 // Add the capture id to the first step of the pattern
1608 int capture_id = symbol_table_id_for_name(
1609 &self->captures,
1610 capture_name,
1611 length
1612 );
1613 if (capture_id == -1) {
1614 stream_reset(stream, capture_name);
1615 return TSQueryErrorCapture;
1616 }
1617
1618 array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1619 .type = TSQueryPredicateStepTypeCapture,
1620 .value_id = capture_id,
1621 }));
1622 }
1623
1624 // Parse a string literal
1625 else if (stream->next == '"') {
1626 TSQueryError e = ts_query__parse_string_literal(self, stream);
1627 if (e) return e;
1628 uint16_t id = symbol_table_insert_name(
1629 &self->predicate_values,
1630 self->string_buffer.contents,
1631 self->string_buffer.size
1632 );
1633 array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1634 .type = TSQueryPredicateStepTypeString,
1635 .value_id = id,
1636 }));
1637 }
1638
1639 // Parse a bare symbol
1640 else if (stream_is_ident_start(stream)) {
1641 const char *symbol_start = stream->input;
1642 stream_scan_identifier(stream);
1643 uint32_t length = stream->input - symbol_start;
1644 uint16_t id = symbol_table_insert_name(
1645 &self->predicate_values,
1646 symbol_start,
1647 length
1648 );
1649 array_push(&self->predicate_steps, ((TSQueryPredicateStep) {
1650 .type = TSQueryPredicateStepTypeString,
1651 .value_id = id,
1652 }));
1653 }
1654
1655 else {
1656 return TSQueryErrorSyntax;
1657 }
1658
1659 stream_skip_whitespace(stream);
1660 }
1661
1662 return 0;
1663 }
1664
1665 // Read one S-expression pattern from the stream, and incorporate it into
1666 // the query's internal state machine representation. For nested patterns,
1667 // this function calls itself recursively.
ts_query__parse_pattern(TSQuery * self,Stream * stream,uint32_t depth,bool is_immediate)1668 static TSQueryError ts_query__parse_pattern(
1669 TSQuery *self,
1670 Stream *stream,
1671 uint32_t depth,
1672 bool is_immediate
1673 ) {
1674 if (stream->next == 0) return TSQueryErrorSyntax;
1675 if (stream->next == ')' || stream->next == ']') return PARENT_DONE;
1676
1677 const uint32_t starting_step_index = self->steps.size;
1678
1679 // Store the byte offset of each step in the query.
1680 if (
1681 self->step_offsets.size == 0 ||
1682 array_back(&self->step_offsets)->step_index != starting_step_index
1683 ) {
1684 array_push(&self->step_offsets, ((StepOffset) {
1685 .step_index = starting_step_index,
1686 .byte_offset = stream_offset(stream),
1687 }));
1688 }
1689
1690 // An open bracket is the start of an alternation.
1691 if (stream->next == '[') {
1692 stream_advance(stream);
1693 stream_skip_whitespace(stream);
1694
1695 // Parse each branch, and add a placeholder step in between the branches.
1696 Array(uint32_t) branch_step_indices = array_new();
1697 for (;;) {
1698 uint32_t start_index = self->steps.size;
1699 TSQueryError e = ts_query__parse_pattern(
1700 self,
1701 stream,
1702 depth,
1703 is_immediate
1704 );
1705
1706 if (e == PARENT_DONE) {
1707 if (stream->next == ']' && branch_step_indices.size > 0) {
1708 stream_advance(stream);
1709 break;
1710 }
1711 e = TSQueryErrorSyntax;
1712 }
1713 if (e) {
1714 array_delete(&branch_step_indices);
1715 return e;
1716 }
1717
1718 array_push(&branch_step_indices, start_index);
1719 array_push(&self->steps, query_step__new(0, depth, false));
1720 }
1721 (void)array_pop(&self->steps);
1722
1723 // For all of the branches except for the last one, add the subsequent branch as an
1724 // alternative, and link the end of the branch to the current end of the steps.
1725 for (unsigned i = 0; i < branch_step_indices.size - 1; i++) {
1726 uint32_t step_index = branch_step_indices.contents[i];
1727 uint32_t next_step_index = branch_step_indices.contents[i + 1];
1728 QueryStep *start_step = &self->steps.contents[step_index];
1729 QueryStep *end_step = &self->steps.contents[next_step_index - 1];
1730 start_step->alternative_index = next_step_index;
1731 end_step->alternative_index = self->steps.size;
1732 end_step->is_dead_end = true;
1733 }
1734
1735 array_delete(&branch_step_indices);
1736 }
1737
1738 // An open parenthesis can be the start of three possible constructs:
1739 // * A grouped sequence
1740 // * A predicate
1741 // * A named node
1742 else if (stream->next == '(') {
1743 stream_advance(stream);
1744 stream_skip_whitespace(stream);
1745
1746 // If this parenthesis is followed by a node, then it represents a grouped sequence.
1747 if (stream->next == '(' || stream->next == '"' || stream->next == '[') {
1748 bool child_is_immediate = false;
1749 for (;;) {
1750 if (stream->next == '.') {
1751 child_is_immediate = true;
1752 stream_advance(stream);
1753 stream_skip_whitespace(stream);
1754 }
1755 TSQueryError e = ts_query__parse_pattern(
1756 self,
1757 stream,
1758 depth,
1759 child_is_immediate
1760 );
1761 if (e == PARENT_DONE) {
1762 if (stream->next == ')') {
1763 stream_advance(stream);
1764 break;
1765 }
1766 e = TSQueryErrorSyntax;
1767 }
1768 if (e) return e;
1769
1770 child_is_immediate = false;
1771 }
1772 }
1773
1774 // A dot/pound character indicates the start of a predicate.
1775 else if (stream->next == '.' || stream->next == '#') {
1776 stream_advance(stream);
1777 return ts_query__parse_predicate(self, stream);
1778 }
1779
1780 // Otherwise, this parenthesis is the start of a named node.
1781 else {
1782 TSSymbol symbol;
1783
1784 // Parse a normal node name
1785 if (stream_is_ident_start(stream)) {
1786 const char *node_name = stream->input;
1787 stream_scan_identifier(stream);
1788 uint32_t length = stream->input - node_name;
1789
1790 // TODO - remove.
1791 // For temporary backward compatibility, handle predicates without the leading '#' sign.
1792 if (length > 0 && (node_name[length - 1] == '!' || node_name[length - 1] == '?')) {
1793 stream_reset(stream, node_name);
1794 return ts_query__parse_predicate(self, stream);
1795 }
1796
1797 // Parse the wildcard symbol
1798 else if (length == 1 && node_name[0] == '_') {
1799 symbol = WILDCARD_SYMBOL;
1800 }
1801
1802 else {
1803 symbol = ts_language_symbol_for_name(
1804 self->language,
1805 node_name,
1806 length,
1807 true
1808 );
1809 if (!symbol) {
1810 stream_reset(stream, node_name);
1811 return TSQueryErrorNodeType;
1812 }
1813 }
1814 } else {
1815 return TSQueryErrorSyntax;
1816 }
1817
1818 // Add a step for the node.
1819 array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1820 QueryStep *step = array_back(&self->steps);
1821 if (ts_language_symbol_metadata(self->language, symbol).supertype) {
1822 step->supertype_symbol = step->symbol;
1823 step->symbol = WILDCARD_SYMBOL;
1824 }
1825 if (symbol == WILDCARD_SYMBOL) {
1826 step->is_named = true;
1827 }
1828
1829 stream_skip_whitespace(stream);
1830
1831 if (stream->next == '/') {
1832 stream_advance(stream);
1833 if (!stream_is_ident_start(stream)) {
1834 return TSQueryErrorSyntax;
1835 }
1836
1837 const char *node_name = stream->input;
1838 stream_scan_identifier(stream);
1839 uint32_t length = stream->input - node_name;
1840
1841 step->symbol = ts_language_symbol_for_name(
1842 self->language,
1843 node_name,
1844 length,
1845 true
1846 );
1847 if (!step->symbol) {
1848 stream_reset(stream, node_name);
1849 return TSQueryErrorNodeType;
1850 }
1851
1852 stream_skip_whitespace(stream);
1853 }
1854
1855 // Parse the child patterns
1856 bool child_is_immediate = false;
1857 uint16_t last_child_step_index = 0;
1858 uint16_t negated_field_count = 0;
1859 TSFieldId negated_field_ids[MAX_NEGATED_FIELD_COUNT];
1860 for (;;) {
1861 // Parse a negated field assertion
1862 if (stream->next == '!') {
1863 stream_advance(stream);
1864 stream_skip_whitespace(stream);
1865 if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
1866 const char *field_name = stream->input;
1867 stream_scan_identifier(stream);
1868 uint32_t length = stream->input - field_name;
1869 stream_skip_whitespace(stream);
1870
1871 TSFieldId field_id = ts_language_field_id_for_name(
1872 self->language,
1873 field_name,
1874 length
1875 );
1876 if (!field_id) {
1877 stream->input = field_name;
1878 return TSQueryErrorField;
1879 }
1880
1881 // Keep the field ids sorted.
1882 if (negated_field_count < MAX_NEGATED_FIELD_COUNT) {
1883 negated_field_ids[negated_field_count] = field_id;
1884 negated_field_count++;
1885 }
1886
1887 continue;
1888 }
1889
1890 // Parse a sibling anchor
1891 if (stream->next == '.') {
1892 child_is_immediate = true;
1893 stream_advance(stream);
1894 stream_skip_whitespace(stream);
1895 }
1896
1897 uint16_t step_index = self->steps.size;
1898 TSQueryError e = ts_query__parse_pattern(
1899 self,
1900 stream,
1901 depth + 1,
1902 child_is_immediate
1903 );
1904 if (e == PARENT_DONE) {
1905 if (stream->next == ')') {
1906 if (child_is_immediate) {
1907 if (last_child_step_index == 0) return TSQueryErrorSyntax;
1908 self->steps.contents[last_child_step_index].is_last_child = true;
1909 }
1910
1911 if (negated_field_count) {
1912 ts_query__add_negated_fields(
1913 self,
1914 starting_step_index,
1915 negated_field_ids,
1916 negated_field_count
1917 );
1918 }
1919
1920 stream_advance(stream);
1921 break;
1922 }
1923 e = TSQueryErrorSyntax;
1924 }
1925 if (e) return e;
1926
1927 last_child_step_index = step_index;
1928 child_is_immediate = false;
1929 }
1930 }
1931 }
1932
1933 // Parse a wildcard pattern
1934 else if (stream->next == '_') {
1935 stream_advance(stream);
1936 stream_skip_whitespace(stream);
1937
1938 // Add a step that matches any kind of node
1939 array_push(&self->steps, query_step__new(WILDCARD_SYMBOL, depth, is_immediate));
1940 }
1941
1942 // Parse a double-quoted anonymous leaf node expression
1943 else if (stream->next == '"') {
1944 const char *string_start = stream->input;
1945 TSQueryError e = ts_query__parse_string_literal(self, stream);
1946 if (e) return e;
1947
1948 // Add a step for the node
1949 TSSymbol symbol = ts_language_symbol_for_name(
1950 self->language,
1951 self->string_buffer.contents,
1952 self->string_buffer.size,
1953 false
1954 );
1955 if (!symbol) {
1956 stream_reset(stream, string_start + 1);
1957 return TSQueryErrorNodeType;
1958 }
1959 array_push(&self->steps, query_step__new(symbol, depth, is_immediate));
1960 }
1961
1962 // Parse a field-prefixed pattern
1963 else if (stream_is_ident_start(stream)) {
1964 // Parse the field name
1965 const char *field_name = stream->input;
1966 stream_scan_identifier(stream);
1967 uint32_t length = stream->input - field_name;
1968 stream_skip_whitespace(stream);
1969
1970 if (stream->next != ':') {
1971 stream_reset(stream, field_name);
1972 return TSQueryErrorSyntax;
1973 }
1974 stream_advance(stream);
1975 stream_skip_whitespace(stream);
1976
1977 // Parse the pattern
1978 TSQueryError e = ts_query__parse_pattern(
1979 self,
1980 stream,
1981 depth,
1982 is_immediate
1983 );
1984 if (e == PARENT_DONE) return TSQueryErrorSyntax;
1985 if (e) return e;
1986
1987 // Add the field name to the first step of the pattern
1988 TSFieldId field_id = ts_language_field_id_for_name(
1989 self->language,
1990 field_name,
1991 length
1992 );
1993 if (!field_id) {
1994 stream->input = field_name;
1995 return TSQueryErrorField;
1996 }
1997
1998 uint32_t step_index = starting_step_index;
1999 QueryStep *step = &self->steps.contents[step_index];
2000 for (;;) {
2001 step->field = field_id;
2002 if (
2003 step->alternative_index != NONE &&
2004 step->alternative_index > step_index &&
2005 step->alternative_index < self->steps.size
2006 ) {
2007 step_index = step->alternative_index;
2008 step = &self->steps.contents[step_index];
2009 } else {
2010 break;
2011 }
2012 }
2013 }
2014
2015 else {
2016 return TSQueryErrorSyntax;
2017 }
2018
2019 stream_skip_whitespace(stream);
2020
2021 // Parse suffixes modifiers for this pattern
2022 for (;;) {
2023 // Parse the one-or-more operator.
2024 if (stream->next == '+') {
2025 stream_advance(stream);
2026 stream_skip_whitespace(stream);
2027
2028 QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false);
2029 repeat_step.alternative_index = starting_step_index;
2030 repeat_step.is_pass_through = true;
2031 repeat_step.alternative_is_immediate = true;
2032 array_push(&self->steps, repeat_step);
2033 }
2034
2035 // Parse the zero-or-more repetition operator.
2036 else if (stream->next == '*') {
2037 stream_advance(stream);
2038 stream_skip_whitespace(stream);
2039
2040 QueryStep repeat_step = query_step__new(WILDCARD_SYMBOL, depth, false);
2041 repeat_step.alternative_index = starting_step_index;
2042 repeat_step.is_pass_through = true;
2043 repeat_step.alternative_is_immediate = true;
2044 array_push(&self->steps, repeat_step);
2045
2046 QueryStep *step = &self->steps.contents[starting_step_index];
2047 while (step->alternative_index != NONE) {
2048 step = &self->steps.contents[step->alternative_index];
2049 }
2050 step->alternative_index = self->steps.size;
2051 }
2052
2053 // Parse the optional operator.
2054 else if (stream->next == '?') {
2055 stream_advance(stream);
2056 stream_skip_whitespace(stream);
2057
2058 QueryStep *step = &self->steps.contents[starting_step_index];
2059 while (step->alternative_index != NONE) {
2060 step = &self->steps.contents[step->alternative_index];
2061 }
2062 step->alternative_index = self->steps.size;
2063 }
2064
2065 // Parse an '@'-prefixed capture pattern
2066 else if (stream->next == '@') {
2067 stream_advance(stream);
2068 if (!stream_is_ident_start(stream)) return TSQueryErrorSyntax;
2069 const char *capture_name = stream->input;
2070 stream_scan_identifier(stream);
2071 uint32_t length = stream->input - capture_name;
2072 stream_skip_whitespace(stream);
2073
2074 // Add the capture id to the first step of the pattern
2075 uint16_t capture_id = symbol_table_insert_name(
2076 &self->captures,
2077 capture_name,
2078 length
2079 );
2080
2081 uint32_t step_index = starting_step_index;
2082 for (;;) {
2083 QueryStep *step = &self->steps.contents[step_index];
2084 query_step__add_capture(step, capture_id);
2085 if (
2086 step->alternative_index != NONE &&
2087 step->alternative_index > step_index &&
2088 step->alternative_index < self->steps.size
2089 ) {
2090 step_index = step->alternative_index;
2091 step = &self->steps.contents[step_index];
2092 } else {
2093 break;
2094 }
2095 }
2096 }
2097
2098 // No more suffix modifiers
2099 else {
2100 break;
2101 }
2102 }
2103
2104 return 0;
2105 }
2106
ts_query_new(const TSLanguage * language,const char * source,uint32_t source_len,uint32_t * error_offset,TSQueryError * error_type)2107 TSQuery *ts_query_new(
2108 const TSLanguage *language,
2109 const char *source,
2110 uint32_t source_len,
2111 uint32_t *error_offset,
2112 TSQueryError *error_type
2113 ) {
2114 if (
2115 !language ||
2116 language->version > TREE_SITTER_LANGUAGE_VERSION ||
2117 language->version < TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION
2118 ) {
2119 *error_type = TSQueryErrorLanguage;
2120 return NULL;
2121 }
2122
2123 TSQuery *self = ts_malloc(sizeof(TSQuery));
2124 *self = (TSQuery) {
2125 .steps = array_new(),
2126 .pattern_map = array_new(),
2127 .captures = symbol_table_new(),
2128 .predicate_values = symbol_table_new(),
2129 .predicate_steps = array_new(),
2130 .patterns = array_new(),
2131 .step_offsets = array_new(),
2132 .string_buffer = array_new(),
2133 .negated_fields = array_new(),
2134 .wildcard_root_pattern_count = 0,
2135 .language = language,
2136 };
2137
2138 array_push(&self->negated_fields, 0);
2139
2140 // Parse all of the S-expressions in the given string.
2141 Stream stream = stream_new(source, source_len);
2142 stream_skip_whitespace(&stream);
2143 while (stream.input < stream.end) {
2144 uint32_t pattern_index = self->patterns.size;
2145 uint32_t start_step_index = self->steps.size;
2146 uint32_t start_predicate_step_index = self->predicate_steps.size;
2147 array_push(&self->patterns, ((QueryPattern) {
2148 .steps = (Slice) {.offset = start_step_index},
2149 .predicate_steps = (Slice) {.offset = start_predicate_step_index},
2150 .start_byte = stream_offset(&stream),
2151 }));
2152 *error_type = ts_query__parse_pattern(self, &stream, 0, false);
2153 array_push(&self->steps, query_step__new(0, PATTERN_DONE_MARKER, false));
2154
2155 QueryPattern *pattern = array_back(&self->patterns);
2156 pattern->steps.length = self->steps.size - start_step_index;
2157 pattern->predicate_steps.length = self->predicate_steps.size - start_predicate_step_index;
2158
2159 // If any pattern could not be parsed, then report the error information
2160 // and terminate.
2161 if (*error_type) {
2162 if (*error_type == PARENT_DONE) *error_type = TSQueryErrorSyntax;
2163 *error_offset = stream_offset(&stream);
2164 ts_query_delete(self);
2165 return NULL;
2166 }
2167
2168 // Maintain a map that can look up patterns for a given root symbol.
2169 uint16_t wildcard_root_alternative_index = NONE;
2170 for (;;) {
2171 QueryStep *step = &self->steps.contents[start_step_index];
2172
2173 // If a pattern has a wildcard at its root, but it has a non-wildcard child,
2174 // then optimize the matching process by skipping matching the wildcard.
2175 // Later, during the matching process, the query cursor will check that
2176 // there is a parent node, and capture it if necessary.
2177 if (step->symbol == WILDCARD_SYMBOL && step->depth == 0 && !step->field) {
2178 QueryStep *second_step = &self->steps.contents[start_step_index + 1];
2179 if (second_step->symbol != WILDCARD_SYMBOL && second_step->depth == 1) {
2180 wildcard_root_alternative_index = step->alternative_index;
2181 start_step_index += 1;
2182 step = second_step;
2183 }
2184 }
2185
2186 // Determine whether the pattern has a single root node. This affects
2187 // decisions about whether or not to start matching the pattern when
2188 // a query cursor has a range restriction.
2189 bool is_rooted = true;
2190 uint32_t start_depth = step->depth;
2191 for (uint32_t step_index = start_step_index + 1; step_index < self->steps.size; step_index++) {
2192 QueryStep *step = &self->steps.contents[step_index];
2193 if (step->depth == start_depth) {
2194 is_rooted = false;
2195 break;
2196 }
2197 }
2198
2199 ts_query__pattern_map_insert(self, step->symbol, (PatternEntry) {
2200 .step_index = start_step_index,
2201 .pattern_index = pattern_index,
2202 .is_rooted = is_rooted
2203 });
2204 if (step->symbol == WILDCARD_SYMBOL) {
2205 self->wildcard_root_pattern_count++;
2206 }
2207
2208 // If there are alternatives or options at the root of the pattern,
2209 // then add multiple entries to the pattern map.
2210 if (step->alternative_index != NONE) {
2211 start_step_index = step->alternative_index;
2212 step->alternative_index = NONE;
2213 } else if (wildcard_root_alternative_index != NONE) {
2214 start_step_index = wildcard_root_alternative_index;
2215 wildcard_root_alternative_index = NONE;
2216 } else {
2217 break;
2218 }
2219 }
2220 }
2221
2222 if (!ts_query__analyze_patterns(self, error_offset)) {
2223 *error_type = TSQueryErrorStructure;
2224 ts_query_delete(self);
2225 return NULL;
2226 }
2227
2228 array_delete(&self->string_buffer);
2229 return self;
2230 }
2231
ts_query_delete(TSQuery * self)2232 void ts_query_delete(TSQuery *self) {
2233 if (self) {
2234 array_delete(&self->steps);
2235 array_delete(&self->pattern_map);
2236 array_delete(&self->predicate_steps);
2237 array_delete(&self->patterns);
2238 array_delete(&self->step_offsets);
2239 array_delete(&self->string_buffer);
2240 array_delete(&self->negated_fields);
2241 symbol_table_delete(&self->captures);
2242 symbol_table_delete(&self->predicate_values);
2243 ts_free(self);
2244 }
2245 }
2246
ts_query_pattern_count(const TSQuery * self)2247 uint32_t ts_query_pattern_count(const TSQuery *self) {
2248 return self->patterns.size;
2249 }
2250
ts_query_capture_count(const TSQuery * self)2251 uint32_t ts_query_capture_count(const TSQuery *self) {
2252 return self->captures.slices.size;
2253 }
2254
ts_query_string_count(const TSQuery * self)2255 uint32_t ts_query_string_count(const TSQuery *self) {
2256 return self->predicate_values.slices.size;
2257 }
2258
ts_query_capture_name_for_id(const TSQuery * self,uint32_t index,uint32_t * length)2259 const char *ts_query_capture_name_for_id(
2260 const TSQuery *self,
2261 uint32_t index,
2262 uint32_t *length
2263 ) {
2264 return symbol_table_name_for_id(&self->captures, index, length);
2265 }
2266
ts_query_string_value_for_id(const TSQuery * self,uint32_t index,uint32_t * length)2267 const char *ts_query_string_value_for_id(
2268 const TSQuery *self,
2269 uint32_t index,
2270 uint32_t *length
2271 ) {
2272 return symbol_table_name_for_id(&self->predicate_values, index, length);
2273 }
2274
ts_query_predicates_for_pattern(const TSQuery * self,uint32_t pattern_index,uint32_t * step_count)2275 const TSQueryPredicateStep *ts_query_predicates_for_pattern(
2276 const TSQuery *self,
2277 uint32_t pattern_index,
2278 uint32_t *step_count
2279 ) {
2280 Slice slice = self->patterns.contents[pattern_index].predicate_steps;
2281 *step_count = slice.length;
2282 if (self->predicate_steps.contents == NULL) {
2283 return NULL;
2284 }
2285 return &self->predicate_steps.contents[slice.offset];
2286 }
2287
ts_query_start_byte_for_pattern(const TSQuery * self,uint32_t pattern_index)2288 uint32_t ts_query_start_byte_for_pattern(
2289 const TSQuery *self,
2290 uint32_t pattern_index
2291 ) {
2292 return self->patterns.contents[pattern_index].start_byte;
2293 }
2294
ts_query_is_pattern_guaranteed_at_step(const TSQuery * self,uint32_t byte_offset)2295 bool ts_query_is_pattern_guaranteed_at_step(
2296 const TSQuery *self,
2297 uint32_t byte_offset
2298 ) {
2299 uint32_t step_index = UINT32_MAX;
2300 for (unsigned i = 0; i < self->step_offsets.size; i++) {
2301 StepOffset *step_offset = &self->step_offsets.contents[i];
2302 if (step_offset->byte_offset > byte_offset) break;
2303 step_index = step_offset->step_index;
2304 }
2305 if (step_index < self->steps.size) {
2306 return self->steps.contents[step_index].root_pattern_guaranteed;
2307 } else {
2308 return false;
2309 }
2310 }
2311
ts_query__step_is_fallible(const TSQuery * self,uint16_t step_index)2312 bool ts_query__step_is_fallible(
2313 const TSQuery *self,
2314 uint16_t step_index
2315 ) {
2316 assert((uint32_t)step_index + 1 < self->steps.size);
2317 QueryStep *step = &self->steps.contents[step_index];
2318 QueryStep *next_step = &self->steps.contents[step_index + 1];
2319 return (
2320 next_step->depth != PATTERN_DONE_MARKER &&
2321 next_step->depth > step->depth &&
2322 !next_step->parent_pattern_guaranteed
2323 );
2324 }
2325
ts_query_disable_capture(TSQuery * self,const char * name,uint32_t length)2326 void ts_query_disable_capture(
2327 TSQuery *self,
2328 const char *name,
2329 uint32_t length
2330 ) {
2331 // Remove capture information for any pattern step that previously
2332 // captured with the given name.
2333 int id = symbol_table_id_for_name(&self->captures, name, length);
2334 if (id != -1) {
2335 for (unsigned i = 0; i < self->steps.size; i++) {
2336 QueryStep *step = &self->steps.contents[i];
2337 query_step__remove_capture(step, id);
2338 }
2339 }
2340 }
2341
ts_query_disable_pattern(TSQuery * self,uint32_t pattern_index)2342 void ts_query_disable_pattern(
2343 TSQuery *self,
2344 uint32_t pattern_index
2345 ) {
2346 // Remove the given pattern from the pattern map. Its steps will still
2347 // be in the `steps` array, but they will never be read.
2348 for (unsigned i = 0; i < self->pattern_map.size; i++) {
2349 PatternEntry *pattern = &self->pattern_map.contents[i];
2350 if (pattern->pattern_index == pattern_index) {
2351 array_erase(&self->pattern_map, i);
2352 i--;
2353 }
2354 }
2355 }
2356
2357 /***************
2358 * QueryCursor
2359 ***************/
2360
ts_query_cursor_new(void)2361 TSQueryCursor *ts_query_cursor_new(void) {
2362 TSQueryCursor *self = ts_malloc(sizeof(TSQueryCursor));
2363 *self = (TSQueryCursor) {
2364 .did_exceed_match_limit = false,
2365 .ascending = false,
2366 .halted = false,
2367 .states = array_new(),
2368 .finished_states = array_new(),
2369 .capture_list_pool = capture_list_pool_new(),
2370 .start_byte = 0,
2371 .end_byte = UINT32_MAX,
2372 .start_point = {0, 0},
2373 .end_point = POINT_MAX,
2374 };
2375 array_reserve(&self->states, 8);
2376 array_reserve(&self->finished_states, 8);
2377 return self;
2378 }
2379
ts_query_cursor_delete(TSQueryCursor * self)2380 void ts_query_cursor_delete(TSQueryCursor *self) {
2381 array_delete(&self->states);
2382 array_delete(&self->finished_states);
2383 ts_tree_cursor_delete(&self->cursor);
2384 capture_list_pool_delete(&self->capture_list_pool);
2385 ts_free(self);
2386 }
2387
ts_query_cursor_did_exceed_match_limit(const TSQueryCursor * self)2388 bool ts_query_cursor_did_exceed_match_limit(const TSQueryCursor *self) {
2389 return self->did_exceed_match_limit;
2390 }
2391
ts_query_cursor_match_limit(const TSQueryCursor * self)2392 uint32_t ts_query_cursor_match_limit(const TSQueryCursor *self) {
2393 return self->capture_list_pool.max_capture_list_count;
2394 }
2395
ts_query_cursor_set_match_limit(TSQueryCursor * self,uint32_t limit)2396 void ts_query_cursor_set_match_limit(TSQueryCursor *self, uint32_t limit) {
2397 self->capture_list_pool.max_capture_list_count = limit;
2398 }
2399
ts_query_cursor_exec(TSQueryCursor * self,const TSQuery * query,TSNode node)2400 void ts_query_cursor_exec(
2401 TSQueryCursor *self,
2402 const TSQuery *query,
2403 TSNode node
2404 ) {
2405 array_clear(&self->states);
2406 array_clear(&self->finished_states);
2407 ts_tree_cursor_reset(&self->cursor, node);
2408 capture_list_pool_reset(&self->capture_list_pool);
2409 self->next_state_id = 0;
2410 self->depth = 0;
2411 self->ascending = false;
2412 self->halted = false;
2413 self->query = query;
2414 self->did_exceed_match_limit = false;
2415 }
2416
ts_query_cursor_set_byte_range(TSQueryCursor * self,uint32_t start_byte,uint32_t end_byte)2417 void ts_query_cursor_set_byte_range(
2418 TSQueryCursor *self,
2419 uint32_t start_byte,
2420 uint32_t end_byte
2421 ) {
2422 if (end_byte == 0) {
2423 end_byte = UINT32_MAX;
2424 }
2425 self->start_byte = start_byte;
2426 self->end_byte = end_byte;
2427 }
2428
ts_query_cursor_set_point_range(TSQueryCursor * self,TSPoint start_point,TSPoint end_point)2429 void ts_query_cursor_set_point_range(
2430 TSQueryCursor *self,
2431 TSPoint start_point,
2432 TSPoint end_point
2433 ) {
2434 if (end_point.row == 0 && end_point.column == 0) {
2435 end_point = POINT_MAX;
2436 }
2437 self->start_point = start_point;
2438 self->end_point = end_point;
2439 }
2440
2441 // Search through all of the in-progress states, and find the captured
2442 // node that occurs earliest in the document.
ts_query_cursor__first_in_progress_capture(TSQueryCursor * self,uint32_t * state_index,uint32_t * byte_offset,uint32_t * pattern_index,bool * root_pattern_guaranteed)2443 static bool ts_query_cursor__first_in_progress_capture(
2444 TSQueryCursor *self,
2445 uint32_t *state_index,
2446 uint32_t *byte_offset,
2447 uint32_t *pattern_index,
2448 bool *root_pattern_guaranteed
2449 ) {
2450 bool result = false;
2451 *state_index = UINT32_MAX;
2452 *byte_offset = UINT32_MAX;
2453 *pattern_index = UINT32_MAX;
2454 for (unsigned i = 0; i < self->states.size; i++) {
2455 QueryState *state = &self->states.contents[i];
2456 if (state->dead) continue;
2457
2458 const CaptureList *captures = capture_list_pool_get(
2459 &self->capture_list_pool,
2460 state->capture_list_id
2461 );
2462 if (state->consumed_capture_count >= captures->size) {
2463 continue;
2464 }
2465
2466 TSNode node = captures->contents[state->consumed_capture_count].node;
2467 if (
2468 ts_node_end_byte(node) <= self->start_byte ||
2469 point_lte(ts_node_end_point(node), self->start_point)
2470 ) {
2471 state->consumed_capture_count++;
2472 i--;
2473 continue;
2474 }
2475
2476 uint32_t node_start_byte = ts_node_start_byte(node);
2477 if (
2478 !result ||
2479 node_start_byte < *byte_offset ||
2480 (node_start_byte == *byte_offset && state->pattern_index < *pattern_index)
2481 ) {
2482 QueryStep *step = &self->query->steps.contents[state->step_index];
2483 if (root_pattern_guaranteed) {
2484 *root_pattern_guaranteed = step->root_pattern_guaranteed;
2485 } else if (step->root_pattern_guaranteed) {
2486 continue;
2487 }
2488
2489 result = true;
2490 *state_index = i;
2491 *byte_offset = node_start_byte;
2492 *pattern_index = state->pattern_index;
2493 }
2494 }
2495 return result;
2496 }
2497
2498 // Determine which node is first in a depth-first traversal
ts_query_cursor__compare_nodes(TSNode left,TSNode right)2499 int ts_query_cursor__compare_nodes(TSNode left, TSNode right) {
2500 if (left.id != right.id) {
2501 uint32_t left_start = ts_node_start_byte(left);
2502 uint32_t right_start = ts_node_start_byte(right);
2503 if (left_start < right_start) return -1;
2504 if (left_start > right_start) return 1;
2505 uint32_t left_node_count = ts_node_end_byte(left);
2506 uint32_t right_node_count = ts_node_end_byte(right);
2507 if (left_node_count > right_node_count) return -1;
2508 if (left_node_count < right_node_count) return 1;
2509 }
2510 return 0;
2511 }
2512
2513 // Determine if either state contains a superset of the other state's captures.
ts_query_cursor__compare_captures(TSQueryCursor * self,QueryState * left_state,QueryState * right_state,bool * left_contains_right,bool * right_contains_left)2514 void ts_query_cursor__compare_captures(
2515 TSQueryCursor *self,
2516 QueryState *left_state,
2517 QueryState *right_state,
2518 bool *left_contains_right,
2519 bool *right_contains_left
2520 ) {
2521 const CaptureList *left_captures = capture_list_pool_get(
2522 &self->capture_list_pool,
2523 left_state->capture_list_id
2524 );
2525 const CaptureList *right_captures = capture_list_pool_get(
2526 &self->capture_list_pool,
2527 right_state->capture_list_id
2528 );
2529 *left_contains_right = true;
2530 *right_contains_left = true;
2531 unsigned i = 0, j = 0;
2532 for (;;) {
2533 if (i < left_captures->size) {
2534 if (j < right_captures->size) {
2535 TSQueryCapture *left = &left_captures->contents[i];
2536 TSQueryCapture *right = &right_captures->contents[j];
2537 if (left->node.id == right->node.id && left->index == right->index) {
2538 i++;
2539 j++;
2540 } else {
2541 switch (ts_query_cursor__compare_nodes(left->node, right->node)) {
2542 case -1:
2543 *right_contains_left = false;
2544 i++;
2545 break;
2546 case 1:
2547 *left_contains_right = false;
2548 j++;
2549 break;
2550 default:
2551 *right_contains_left = false;
2552 *left_contains_right = false;
2553 i++;
2554 j++;
2555 break;
2556 }
2557 }
2558 } else {
2559 *right_contains_left = false;
2560 break;
2561 }
2562 } else {
2563 if (j < right_captures->size) {
2564 *left_contains_right = false;
2565 }
2566 break;
2567 }
2568 }
2569 }
2570
2571 #ifdef DEBUG_EXECUTE_QUERY
2572 #define LOG(...) fprintf(stderr, __VA_ARGS__)
2573 #else
2574 #define LOG(...)
2575 #endif
2576
ts_query_cursor__add_state(TSQueryCursor * self,const PatternEntry * pattern)2577 static void ts_query_cursor__add_state(
2578 TSQueryCursor *self,
2579 const PatternEntry *pattern
2580 ) {
2581 QueryStep *step = &self->query->steps.contents[pattern->step_index];
2582 uint32_t start_depth = self->depth - step->depth;
2583
2584 // Keep the states array in ascending order of start_depth and pattern_index,
2585 // so that it can be processed more efficiently elsewhere. Usually, there is
2586 // no work to do here because of two facts:
2587 // * States with lower start_depth are naturally added first due to the
2588 // order in which nodes are visited.
2589 // * Earlier patterns are naturally added first because of the ordering of the
2590 // pattern_map data structure that's used to initiate matches.
2591 //
2592 // This loop is only needed in cases where two conditions hold:
2593 // * A pattern consists of more than one sibling node, so that its states
2594 // remain in progress after exiting the node that started the match.
2595 // * The first node in the pattern matches against multiple nodes at the
2596 // same depth.
2597 //
2598 // An example of this is the pattern '((comment)* (function))'. If multiple
2599 // `comment` nodes appear in a row, then we may initiate a new state for this
2600 // pattern while another state for the same pattern is already in progress.
2601 // If there are multiple patterns like this in a query, then this loop will
2602 // need to execute in order to keep the states ordered by pattern_index.
2603 uint32_t index = self->states.size;
2604 while (index > 0) {
2605 QueryState *prev_state = &self->states.contents[index - 1];
2606 if (prev_state->start_depth < start_depth) break;
2607 if (prev_state->start_depth == start_depth) {
2608 // Avoid inserting an unnecessary duplicate state, which would be
2609 // immediately pruned by the longest-match criteria.
2610 if (
2611 prev_state->pattern_index == pattern->pattern_index &&
2612 prev_state->step_index == pattern->step_index
2613 ) return;
2614 if (prev_state->pattern_index <= pattern->pattern_index) break;
2615 }
2616 index--;
2617 }
2618
2619 LOG(
2620 " start state. pattern:%u, step:%u\n",
2621 pattern->pattern_index,
2622 pattern->step_index
2623 );
2624 array_insert(&self->states, index, ((QueryState) {
2625 .id = UINT32_MAX,
2626 .capture_list_id = NONE,
2627 .step_index = pattern->step_index,
2628 .pattern_index = pattern->pattern_index,
2629 .start_depth = start_depth,
2630 .consumed_capture_count = 0,
2631 .seeking_immediate_match = true,
2632 .has_in_progress_alternatives = false,
2633 .needs_parent = step->depth == 1,
2634 .dead = false,
2635 }));
2636 }
2637
2638 // Acquire a capture list for this state. If there are no capture lists left in the
2639 // pool, this will steal the capture list from another existing state, and mark that
2640 // other state as 'dead'.
ts_query_cursor__prepare_to_capture(TSQueryCursor * self,QueryState * state,unsigned state_index_to_preserve)2641 static CaptureList *ts_query_cursor__prepare_to_capture(
2642 TSQueryCursor *self,
2643 QueryState *state,
2644 unsigned state_index_to_preserve
2645 ) {
2646 if (state->capture_list_id == NONE) {
2647 state->capture_list_id = capture_list_pool_acquire(&self->capture_list_pool);
2648
2649 // If there are no capture lists left in the pool, then terminate whichever
2650 // state has captured the earliest node in the document, and steal its
2651 // capture list.
2652 if (state->capture_list_id == NONE) {
2653 self->did_exceed_match_limit = true;
2654 uint32_t state_index, byte_offset, pattern_index;
2655 if (
2656 ts_query_cursor__first_in_progress_capture(
2657 self,
2658 &state_index,
2659 &byte_offset,
2660 &pattern_index,
2661 NULL
2662 ) &&
2663 state_index != state_index_to_preserve
2664 ) {
2665 LOG(
2666 " abandon state. index:%u, pattern:%u, offset:%u.\n",
2667 state_index, pattern_index, byte_offset
2668 );
2669 QueryState *other_state = &self->states.contents[state_index];
2670 state->capture_list_id = other_state->capture_list_id;
2671 other_state->capture_list_id = NONE;
2672 other_state->dead = true;
2673 CaptureList *list = capture_list_pool_get_mut(
2674 &self->capture_list_pool,
2675 state->capture_list_id
2676 );
2677 array_clear(list);
2678 return list;
2679 } else {
2680 LOG(" ran out of capture lists");
2681 return NULL;
2682 }
2683 }
2684 }
2685 return capture_list_pool_get_mut(&self->capture_list_pool, state->capture_list_id);
2686 }
2687
ts_query_cursor__capture(TSQueryCursor * self,QueryState * state,QueryStep * step,TSNode node)2688 static void ts_query_cursor__capture(
2689 TSQueryCursor *self,
2690 QueryState *state,
2691 QueryStep *step,
2692 TSNode node
2693 ) {
2694 if (state->dead) return;
2695 CaptureList *capture_list = ts_query_cursor__prepare_to_capture(self, state, UINT32_MAX);
2696 if (!capture_list) {
2697 state->dead = true;
2698 return;
2699 }
2700
2701 for (unsigned j = 0; j < MAX_STEP_CAPTURE_COUNT; j++) {
2702 uint16_t capture_id = step->capture_ids[j];
2703 if (step->capture_ids[j] == NONE) break;
2704 array_push(capture_list, ((TSQueryCapture) { node, capture_id }));
2705 LOG(
2706 " capture node. type:%s, pattern:%u, capture_id:%u, capture_count:%u\n",
2707 ts_node_type(node),
2708 state->pattern_index,
2709 capture_id,
2710 capture_list->size
2711 );
2712 }
2713 }
2714
2715 // Duplicate the given state and insert the newly-created state immediately after
2716 // the given state in the `states` array. Ensures that the given state reference is
2717 // still valid, even if the states array is reallocated.
ts_query_cursor__copy_state(TSQueryCursor * self,QueryState ** state_ref)2718 static QueryState *ts_query_cursor__copy_state(
2719 TSQueryCursor *self,
2720 QueryState **state_ref
2721 ) {
2722 const QueryState *state = *state_ref;
2723 uint32_t state_index = state - self->states.contents;
2724 QueryState copy = *state;
2725 copy.capture_list_id = NONE;
2726
2727 // If the state has captures, copy its capture list.
2728 if (state->capture_list_id != NONE) {
2729 CaptureList *new_captures = ts_query_cursor__prepare_to_capture(self, ©, state_index);
2730 if (!new_captures) return NULL;
2731 const CaptureList *old_captures = capture_list_pool_get(
2732 &self->capture_list_pool,
2733 state->capture_list_id
2734 );
2735 array_push_all(new_captures, old_captures);
2736 }
2737
2738 array_insert(&self->states, state_index + 1, copy);
2739 *state_ref = &self->states.contents[state_index];
2740 return &self->states.contents[state_index + 1];
2741 }
2742
2743 // Walk the tree, processing patterns until at least one pattern finishes,
2744 // If one or more patterns finish, return `true` and store their states in the
2745 // `finished_states` array. Multiple patterns can finish on the same node. If
2746 // there are no more matches, return `false`.
ts_query_cursor__advance(TSQueryCursor * self,bool stop_on_definite_step)2747 static inline bool ts_query_cursor__advance(
2748 TSQueryCursor *self,
2749 bool stop_on_definite_step
2750 ) {
2751 bool did_match = false;
2752 for (;;) {
2753 if (self->halted) {
2754 while (self->states.size > 0) {
2755 QueryState state = array_pop(&self->states);
2756 capture_list_pool_release(
2757 &self->capture_list_pool,
2758 state.capture_list_id
2759 );
2760 }
2761 }
2762
2763 if (did_match || self->halted) return did_match;
2764
2765 // Exit the current node.
2766 if (self->ascending) {
2767 LOG(
2768 "leave node. depth:%u, type:%s\n",
2769 self->depth,
2770 ts_node_type(ts_tree_cursor_current_node(&self->cursor))
2771 );
2772
2773 // Leave this node by stepping to its next sibling or to its parent.
2774 if (ts_tree_cursor_goto_next_sibling(&self->cursor)) {
2775 self->ascending = false;
2776 } else if (ts_tree_cursor_goto_parent(&self->cursor)) {
2777 self->depth--;
2778 } else {
2779 LOG("halt at root\n");
2780 self->halted = true;
2781 }
2782
2783 // After leaving a node, remove any states that cannot make further progress.
2784 uint32_t deleted_count = 0;
2785 for (unsigned i = 0, n = self->states.size; i < n; i++) {
2786 QueryState *state = &self->states.contents[i];
2787 QueryStep *step = &self->query->steps.contents[state->step_index];
2788
2789 // If a state completed its pattern inside of this node, but was deferred from finishing
2790 // in order to search for longer matches, mark it as finished.
2791 if (step->depth == PATTERN_DONE_MARKER) {
2792 if (state->start_depth > self->depth || self->halted) {
2793 LOG(" finish pattern %u\n", state->pattern_index);
2794 array_push(&self->finished_states, *state);
2795 did_match = true;
2796 deleted_count++;
2797 continue;
2798 }
2799 }
2800
2801 // If a state needed to match something within this node, then remove that state
2802 // as it has failed to match.
2803 else if ((uint32_t)state->start_depth + (uint32_t)step->depth > self->depth) {
2804 LOG(
2805 " failed to match. pattern:%u, step:%u\n",
2806 state->pattern_index,
2807 state->step_index
2808 );
2809 capture_list_pool_release(
2810 &self->capture_list_pool,
2811 state->capture_list_id
2812 );
2813 deleted_count++;
2814 continue;
2815 }
2816
2817 if (deleted_count > 0) {
2818 self->states.contents[i - deleted_count] = *state;
2819 }
2820 }
2821 self->states.size -= deleted_count;
2822 }
2823
2824 // Enter a new node.
2825 else {
2826 // Get the properties of the current node.
2827 TSNode node = ts_tree_cursor_current_node(&self->cursor);
2828 TSNode parent_node = ts_tree_cursor_parent_node(&self->cursor);
2829 TSSymbol symbol = ts_node_symbol(node);
2830 bool is_named = ts_node_is_named(node);
2831 bool has_later_siblings;
2832 bool has_later_named_siblings;
2833 bool can_have_later_siblings_with_this_field;
2834 TSFieldId field_id = 0;
2835 TSSymbol supertypes[8] = {0};
2836 unsigned supertype_count = 8;
2837 ts_tree_cursor_current_status(
2838 &self->cursor,
2839 &field_id,
2840 &has_later_siblings,
2841 &has_later_named_siblings,
2842 &can_have_later_siblings_with_this_field,
2843 supertypes,
2844 &supertype_count
2845 );
2846 LOG(
2847 "enter node. depth:%u, type:%s, field:%s, row:%u state_count:%u, finished_state_count:%u\n",
2848 self->depth,
2849 ts_node_type(node),
2850 ts_language_field_name_for_id(self->query->language, field_id),
2851 ts_node_start_point(node).row,
2852 self->states.size,
2853 self->finished_states.size
2854 );
2855
2856 bool node_intersects_range = (
2857 ts_node_end_byte(node) > self->start_byte &&
2858 ts_node_start_byte(node) < self->end_byte &&
2859 point_gt(ts_node_end_point(node), self->start_point) &&
2860 point_lt(ts_node_start_point(node), self->end_point)
2861 );
2862
2863 bool parent_intersects_range = ts_node_is_null(parent_node) || (
2864 ts_node_end_byte(parent_node) > self->start_byte &&
2865 ts_node_start_byte(parent_node) < self->end_byte &&
2866 point_gt(ts_node_end_point(parent_node), self->start_point) &&
2867 point_lt(ts_node_start_point(parent_node), self->end_point)
2868 );
2869
2870 // Add new states for any patterns whose root node is a wildcard.
2871 for (unsigned i = 0; i < self->query->wildcard_root_pattern_count; i++) {
2872 PatternEntry *pattern = &self->query->pattern_map.contents[i];
2873
2874 // If this node matches the first step of the pattern, then add a new
2875 // state at the start of this pattern.
2876 QueryStep *step = &self->query->steps.contents[pattern->step_index];
2877 if (
2878 (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) &&
2879 (!step->field || field_id == step->field) &&
2880 (!step->supertype_symbol || supertype_count > 0)
2881 ) {
2882 ts_query_cursor__add_state(self, pattern);
2883 }
2884 }
2885
2886 // Add new states for any patterns whose root node matches this node.
2887 unsigned i;
2888 if (ts_query__pattern_map_search(self->query, symbol, &i)) {
2889 PatternEntry *pattern = &self->query->pattern_map.contents[i];
2890
2891 QueryStep *step = &self->query->steps.contents[pattern->step_index];
2892 do {
2893 // If this node matches the first step of the pattern, then add a new
2894 // state at the start of this pattern.
2895 if (
2896 (node_intersects_range || (!pattern->is_rooted && parent_intersects_range)) &&
2897 (!step->field || field_id == step->field)
2898 ) {
2899 ts_query_cursor__add_state(self, pattern);
2900 }
2901
2902 // Advance to the next pattern whose root node matches this node.
2903 i++;
2904 if (i == self->query->pattern_map.size) break;
2905 pattern = &self->query->pattern_map.contents[i];
2906 step = &self->query->steps.contents[pattern->step_index];
2907 } while (step->symbol == symbol);
2908 }
2909
2910 // Update all of the in-progress states with current node.
2911 for (unsigned i = 0, copy_count = 0; i < self->states.size; i += 1 + copy_count) {
2912 QueryState *state = &self->states.contents[i];
2913 QueryStep *step = &self->query->steps.contents[state->step_index];
2914 state->has_in_progress_alternatives = false;
2915 copy_count = 0;
2916
2917 // Check that the node matches all of the criteria for the next
2918 // step of the pattern.
2919 if ((uint32_t)state->start_depth + (uint32_t)step->depth != self->depth) continue;
2920
2921 // Determine if this node matches this step of the pattern, and also
2922 // if this node can have later siblings that match this step of the
2923 // pattern.
2924 bool node_does_match = false;
2925 if (step->symbol == WILDCARD_SYMBOL) {
2926 node_does_match = is_named || !step->is_named;
2927 } else {
2928 node_does_match = symbol == step->symbol;
2929 }
2930 bool later_sibling_can_match = has_later_siblings;
2931 if ((step->is_immediate && is_named) || state->seeking_immediate_match) {
2932 later_sibling_can_match = false;
2933 }
2934 if (step->is_last_child && has_later_named_siblings) {
2935 node_does_match = false;
2936 }
2937 if (step->supertype_symbol) {
2938 bool has_supertype = false;
2939 for (unsigned j = 0; j < supertype_count; j++) {
2940 if (supertypes[j] == step->supertype_symbol) {
2941 has_supertype = true;
2942 break;
2943 }
2944 }
2945 if (!has_supertype) node_does_match = false;
2946 }
2947 if (step->field) {
2948 if (step->field == field_id) {
2949 if (!can_have_later_siblings_with_this_field) {
2950 later_sibling_can_match = false;
2951 }
2952 } else {
2953 node_does_match = false;
2954 }
2955 }
2956
2957 if (step->negated_field_list_id) {
2958 TSFieldId *negated_field_ids = &self->query->negated_fields.contents[step->negated_field_list_id];
2959 for (;;) {
2960 TSFieldId negated_field_id = *negated_field_ids;
2961 if (negated_field_id) {
2962 negated_field_ids++;
2963 if (ts_node_child_by_field_id(node, negated_field_id).id) {
2964 node_does_match = false;
2965 break;
2966 }
2967 } else {
2968 break;
2969 }
2970 }
2971 }
2972
2973 // Remove states immediately if it is ever clear that they cannot match.
2974 if (!node_does_match) {
2975 if (!later_sibling_can_match) {
2976 LOG(
2977 " discard state. pattern:%u, step:%u\n",
2978 state->pattern_index,
2979 state->step_index
2980 );
2981 capture_list_pool_release(
2982 &self->capture_list_pool,
2983 state->capture_list_id
2984 );
2985 array_erase(&self->states, i);
2986 i--;
2987 }
2988 continue;
2989 }
2990
2991 // Some patterns can match their root node in multiple ways, capturing different
2992 // children. If this pattern step could match later children within the same
2993 // parent, then this query state cannot simply be updated in place. It must be
2994 // split into two states: one that matches this node, and one which skips over
2995 // this node, to preserve the possibility of matching later siblings.
2996 if (later_sibling_can_match && (
2997 step->contains_captures ||
2998 ts_query__step_is_fallible(self->query, state->step_index)
2999 )) {
3000 if (ts_query_cursor__copy_state(self, &state)) {
3001 LOG(
3002 " split state for capture. pattern:%u, step:%u\n",
3003 state->pattern_index,
3004 state->step_index
3005 );
3006 copy_count++;
3007 }
3008 }
3009
3010 // If this pattern started with a wildcard, such that the pattern map
3011 // actually points to the *second* step of the pattern, then check
3012 // that the node has a parent, and capture the parent node if necessary.
3013 if (state->needs_parent) {
3014 TSNode parent = ts_tree_cursor_parent_node(&self->cursor);
3015 if (ts_node_is_null(parent)) {
3016 LOG(" missing parent node\n");
3017 state->dead = true;
3018 } else {
3019 state->needs_parent = false;
3020 QueryStep *skipped_wildcard_step = step;
3021 do {
3022 skipped_wildcard_step--;
3023 } while (
3024 skipped_wildcard_step->is_dead_end ||
3025 skipped_wildcard_step->is_pass_through ||
3026 skipped_wildcard_step->depth > 0
3027 );
3028 if (skipped_wildcard_step->capture_ids[0] != NONE) {
3029 LOG(" capture wildcard parent\n");
3030 ts_query_cursor__capture(
3031 self,
3032 state,
3033 skipped_wildcard_step,
3034 parent
3035 );
3036 }
3037 }
3038 }
3039
3040 // If the current node is captured in this pattern, add it to the capture list.
3041 if (step->capture_ids[0] != NONE) {
3042 ts_query_cursor__capture(self, state, step, node);
3043 }
3044
3045 if (state->dead) {
3046 array_erase(&self->states, i);
3047 i--;
3048 continue;
3049 }
3050
3051 // Advance this state to the next step of its pattern.
3052 state->step_index++;
3053 state->seeking_immediate_match = false;
3054 LOG(
3055 " advance state. pattern:%u, step:%u\n",
3056 state->pattern_index,
3057 state->step_index
3058 );
3059
3060 QueryStep *next_step = &self->query->steps.contents[state->step_index];
3061 if (stop_on_definite_step && next_step->root_pattern_guaranteed) did_match = true;
3062
3063 // If this state's next step has an alternative step, then copy the state in order
3064 // to pursue both alternatives. The alternative step itself may have an alternative,
3065 // so this is an interative process.
3066 unsigned end_index = i + 1;
3067 for (unsigned j = i; j < end_index; j++) {
3068 QueryState *state = &self->states.contents[j];
3069 QueryStep *next_step = &self->query->steps.contents[state->step_index];
3070 if (next_step->alternative_index != NONE) {
3071 // A "dead-end" step exists only to add a non-sequential jump into the step sequence,
3072 // via its alternative index. When a state reaches a dead-end step, it jumps straight
3073 // to the step's alternative.
3074 if (next_step->is_dead_end) {
3075 state->step_index = next_step->alternative_index;
3076 j--;
3077 continue;
3078 }
3079
3080 // A "pass-through" step exists only to add a branch into the step sequence,
3081 // via its alternative_index. When a state reaches a pass-through step, it splits
3082 // in order to process the alternative step, and then it advances to the next step.
3083 if (next_step->is_pass_through) {
3084 state->step_index++;
3085 j--;
3086 }
3087
3088 QueryState *copy = ts_query_cursor__copy_state(self, &state);
3089 if (copy) {
3090 LOG(
3091 " split state for branch. pattern:%u, from_step:%u, to_step:%u, immediate:%d, capture_count: %u\n",
3092 copy->pattern_index,
3093 copy->step_index,
3094 next_step->alternative_index,
3095 next_step->alternative_is_immediate,
3096 capture_list_pool_get(&self->capture_list_pool, copy->capture_list_id)->size
3097 );
3098 end_index++;
3099 copy_count++;
3100 copy->step_index = next_step->alternative_index;
3101 if (next_step->alternative_is_immediate) {
3102 copy->seeking_immediate_match = true;
3103 }
3104 }
3105 }
3106 }
3107 }
3108
3109 for (unsigned i = 0; i < self->states.size; i++) {
3110 QueryState *state = &self->states.contents[i];
3111 if (state->dead) {
3112 array_erase(&self->states, i);
3113 i--;
3114 continue;
3115 }
3116
3117 // Enfore the longest-match criteria. When a query pattern contains optional or
3118 // repeated nodes, this is necessary to avoid multiple redundant states, where
3119 // one state has a strict subset of another state's captures.
3120 bool did_remove = false;
3121 for (unsigned j = i + 1; j < self->states.size; j++) {
3122 QueryState *other_state = &self->states.contents[j];
3123
3124 // Query states are kept in ascending order of start_depth and pattern_index.
3125 // Since the longest-match criteria is only used for deduping matches of the same
3126 // pattern and root node, we only need to perform pairwise comparisons within a
3127 // small slice of the states array.
3128 if (
3129 other_state->start_depth != state->start_depth ||
3130 other_state->pattern_index != state->pattern_index
3131 ) break;
3132
3133 bool left_contains_right, right_contains_left;
3134 ts_query_cursor__compare_captures(
3135 self,
3136 state,
3137 other_state,
3138 &left_contains_right,
3139 &right_contains_left
3140 );
3141 if (left_contains_right) {
3142 if (state->step_index == other_state->step_index) {
3143 LOG(
3144 " drop shorter state. pattern: %u, step_index: %u\n",
3145 state->pattern_index,
3146 state->step_index
3147 );
3148 capture_list_pool_release(&self->capture_list_pool, other_state->capture_list_id);
3149 array_erase(&self->states, j);
3150 j--;
3151 continue;
3152 }
3153 other_state->has_in_progress_alternatives = true;
3154 }
3155 if (right_contains_left) {
3156 if (state->step_index == other_state->step_index) {
3157 LOG(
3158 " drop shorter state. pattern: %u, step_index: %u\n",
3159 state->pattern_index,
3160 state->step_index
3161 );
3162 capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
3163 array_erase(&self->states, i);
3164 i--;
3165 did_remove = true;
3166 break;
3167 }
3168 state->has_in_progress_alternatives = true;
3169 }
3170 }
3171
3172 // If the state is at the end of its pattern, remove it from the list
3173 // of in-progress states and add it to the list of finished states.
3174 if (!did_remove) {
3175 LOG(
3176 " keep state. pattern: %u, start_depth: %u, step_index: %u, capture_count: %u\n",
3177 state->pattern_index,
3178 state->start_depth,
3179 state->step_index,
3180 capture_list_pool_get(&self->capture_list_pool, state->capture_list_id)->size
3181 );
3182 QueryStep *next_step = &self->query->steps.contents[state->step_index];
3183 if (next_step->depth == PATTERN_DONE_MARKER) {
3184 if (state->has_in_progress_alternatives) {
3185 LOG(" defer finishing pattern %u\n", state->pattern_index);
3186 } else {
3187 LOG(" finish pattern %u\n", state->pattern_index);
3188 array_push(&self->finished_states, *state);
3189 array_erase(&self->states, state - self->states.contents);
3190 did_match = true;
3191 i--;
3192 }
3193 }
3194 }
3195 }
3196
3197 // When the current node ends prior to the desired start offset,
3198 // only descend for the purpose of continuing in-progress matches.
3199 bool should_descend = node_intersects_range;
3200 if (!should_descend) {
3201 for (unsigned i = 0; i < self->states.size; i++) {
3202 QueryState *state = &self->states.contents[i];;
3203 QueryStep *next_step = &self->query->steps.contents[state->step_index];
3204 if (
3205 next_step->depth != PATTERN_DONE_MARKER &&
3206 state->start_depth + next_step->depth > self->depth
3207 ) {
3208 should_descend = true;
3209 break;
3210 }
3211 }
3212 }
3213
3214 if (!should_descend) {
3215 LOG(
3216 " not descending. node end byte: %u, start byte: %u\n",
3217 ts_node_end_byte(node),
3218 self->start_byte
3219 );
3220 }
3221
3222 if (should_descend && ts_tree_cursor_goto_first_child(&self->cursor)) {
3223 self->depth++;
3224 } else {
3225 self->ascending = true;
3226 }
3227 }
3228 }
3229 }
3230
ts_query_cursor_next_match(TSQueryCursor * self,TSQueryMatch * match)3231 bool ts_query_cursor_next_match(
3232 TSQueryCursor *self,
3233 TSQueryMatch *match
3234 ) {
3235 if (self->finished_states.size == 0) {
3236 if (!ts_query_cursor__advance(self, false)) {
3237 return false;
3238 }
3239 }
3240
3241 QueryState *state = &self->finished_states.contents[0];
3242 if (state->id == UINT32_MAX) state->id = self->next_state_id++;
3243 match->id = state->id;
3244 match->pattern_index = state->pattern_index;
3245 const CaptureList *captures = capture_list_pool_get(
3246 &self->capture_list_pool,
3247 state->capture_list_id
3248 );
3249 match->captures = captures->contents;
3250 match->capture_count = captures->size;
3251 capture_list_pool_release(&self->capture_list_pool, state->capture_list_id);
3252 array_erase(&self->finished_states, 0);
3253 return true;
3254 }
3255
ts_query_cursor_remove_match(TSQueryCursor * self,uint32_t match_id)3256 void ts_query_cursor_remove_match(
3257 TSQueryCursor *self,
3258 uint32_t match_id
3259 ) {
3260 for (unsigned i = 0; i < self->finished_states.size; i++) {
3261 const QueryState *state = &self->finished_states.contents[i];
3262 if (state->id == match_id) {
3263 capture_list_pool_release(
3264 &self->capture_list_pool,
3265 state->capture_list_id
3266 );
3267 array_erase(&self->finished_states, i);
3268 return;
3269 }
3270 }
3271 }
3272
ts_query_cursor_next_capture(TSQueryCursor * self,TSQueryMatch * match,uint32_t * capture_index)3273 bool ts_query_cursor_next_capture(
3274 TSQueryCursor *self,
3275 TSQueryMatch *match,
3276 uint32_t *capture_index
3277 ) {
3278 // The goal here is to return captures in order, even though they may not
3279 // be discovered in order, because patterns can overlap. Search for matches
3280 // until there is a finished capture that is before any unfinished capture.
3281 for (;;) {
3282 // First, find the earliest capture in an unfinished match.
3283 uint32_t first_unfinished_capture_byte;
3284 uint32_t first_unfinished_pattern_index;
3285 uint32_t first_unfinished_state_index;
3286 bool first_unfinished_state_is_definite = false;
3287 ts_query_cursor__first_in_progress_capture(
3288 self,
3289 &first_unfinished_state_index,
3290 &first_unfinished_capture_byte,
3291 &first_unfinished_pattern_index,
3292 &first_unfinished_state_is_definite
3293 );
3294
3295 // Then find the earliest capture in a finished match. It must occur
3296 // before the first capture in an *unfinished* match.
3297 QueryState *first_finished_state = NULL;
3298 uint32_t first_finished_capture_byte = first_unfinished_capture_byte;
3299 uint32_t first_finished_pattern_index = first_unfinished_pattern_index;
3300 for (unsigned i = 0; i < self->finished_states.size;) {
3301 QueryState *state = &self->finished_states.contents[i];
3302 const CaptureList *captures = capture_list_pool_get(
3303 &self->capture_list_pool,
3304 state->capture_list_id
3305 );
3306
3307 // Remove states whose captures are all consumed.
3308 if (state->consumed_capture_count >= captures->size) {
3309 capture_list_pool_release(
3310 &self->capture_list_pool,
3311 state->capture_list_id
3312 );
3313 array_erase(&self->finished_states, i);
3314 continue;
3315 }
3316
3317 // Skip captures that precede the cursor's start byte.
3318 TSNode node = captures->contents[state->consumed_capture_count].node;
3319 if (ts_node_end_byte(node) <= self->start_byte) {
3320 state->consumed_capture_count++;
3321 continue;
3322 }
3323
3324 uint32_t node_start_byte = ts_node_start_byte(node);
3325 if (
3326 node_start_byte < first_finished_capture_byte ||
3327 (
3328 node_start_byte == first_finished_capture_byte &&
3329 state->pattern_index < first_finished_pattern_index
3330 )
3331 ) {
3332 first_finished_state = state;
3333 first_finished_capture_byte = node_start_byte;
3334 first_finished_pattern_index = state->pattern_index;
3335 }
3336 i++;
3337 }
3338
3339 // If there is finished capture that is clearly before any unfinished
3340 // capture, then return its match, and its capture index. Internally
3341 // record the fact that the capture has been 'consumed'.
3342 QueryState *state;
3343 if (first_finished_state) {
3344 state = first_finished_state;
3345 } else if (first_unfinished_state_is_definite) {
3346 state = &self->states.contents[first_unfinished_state_index];
3347 } else {
3348 state = NULL;
3349 }
3350
3351 if (state) {
3352 if (state->id == UINT32_MAX) state->id = self->next_state_id++;
3353 match->id = state->id;
3354 match->pattern_index = state->pattern_index;
3355 const CaptureList *captures = capture_list_pool_get(
3356 &self->capture_list_pool,
3357 state->capture_list_id
3358 );
3359 match->captures = captures->contents;
3360 match->capture_count = captures->size;
3361 *capture_index = state->consumed_capture_count;
3362 state->consumed_capture_count++;
3363 return true;
3364 }
3365
3366 if (capture_list_pool_is_empty(&self->capture_list_pool)) {
3367 LOG(
3368 " abandon state. index:%u, pattern:%u, offset:%u.\n",
3369 first_unfinished_state_index,
3370 first_unfinished_pattern_index,
3371 first_unfinished_capture_byte
3372 );
3373 capture_list_pool_release(
3374 &self->capture_list_pool,
3375 self->states.contents[first_unfinished_state_index].capture_list_id
3376 );
3377 array_erase(&self->states, first_unfinished_state_index);
3378 }
3379
3380 // If there are no finished matches that are ready to be returned, then
3381 // continue finding more matches.
3382 if (
3383 !ts_query_cursor__advance(self, true) &&
3384 self->finished_states.size == 0
3385 ) return false;
3386 }
3387 }
3388
3389 #undef LOG
3390