1 /* Bluefish HTML Editor
2 * bftextview2_patcompile.c
3 *
4 * Copyright (C) 2008,2009,2011,2013,2014,2015 Olivier Sessink
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /*#define ENABLE_PRINT_DFA*/
21
22 /* for the design docs see bftextview2.h */
23 #include <string.h>
24 #include "bluefish.h"
25 #include "bftextview2_private.h"
26 #include "bftextview2_patcompile.h"
27 #include "bftextview2_langmgr.h"
28 #include "bf_lib.h"
29 /*
30 we do regex pattern support as well.
31
32 we don't do everything that pcre or posix regex patterns can
33 do - these engines have features that cannot be done in a DFA (such as backtracking)
34
35 There are several ways in which regex patterns can be simplified:
36
37 a(bc)+ == abc(abc)*
38 [a-z]+ == [a-z][a-z]*
39 OR reverse:
40 a(bc)* == (a|a[bc]+)
41 a[a-z]* == (a|a[a-z]+)
42
43 a(bc)? = (a|abc)
44 a[a-z]? = (a|a[a-z])
45
46 a(b|c)d == (abd|acd)
47
48 [^a] == [b-z] (and all other ascii characters, for simplification I just use the alphabet)
49
50 so at least we need to be able to compile:
51 the OR construct: (|)
52 the zero-or-more *
53 the character list [a-z]
54
55 */
56
57 #ifdef ENABLE_PRINT_DFA
58 void print_DFA_subset(Tscantable * st, gint16 context, char *chars);
59 #endif
60
61 static gint
pointersort_compare(gconstpointer a,gconstpointer b)62 pointersort_compare(gconstpointer a, gconstpointer b)
63 {
64 return a > b ? -1 : (a == b ? 0 : 1);
65 }
66
67 /* returns a list of tags that are used in this language */
68 GList *
bftextview2_scantable_rematch_highlights(Tscantable * st,const gchar * lang)69 bftextview2_scantable_rematch_highlights(Tscantable * st, const gchar * lang)
70 {
71 int i = 0;
72 GList *retlist = NULL, *tmplist;
73 gpointer temp = NULL;
74 for (i = 0; i < (st->contexts->len); i++) {
75 /* g_print("context %d",i);
76 g_print(" has highlight %s\n",g_array_index(st->contexts, Tcontext, i).contexthighlight);*/
77 if (g_array_index(st->contexts, Tcontext, i).contexthighlight) {
78 g_array_index(st->contexts, Tcontext, i).contexttag =
79 langmrg_lookup_tag_highlight(lang, g_array_index(st->contexts, Tcontext, i).contexthighlight);
80 if (g_array_index(st->contexts, Tcontext, i).contexttag) {
81 retlist = g_list_prepend(retlist, g_array_index(st->contexts, Tcontext, i).contexttag);
82 } else {
83 g_print("Possible error in language file, no textstyle found for context highlight %s\n",
84 g_array_index(st->contexts, Tcontext, i).contexthighlight);
85 }
86 }
87 }
88 for (i = 0; i < (st->matches->len); i++) {
89 /*g_print("pattern %d",i);
90 g_print(" (%s)",g_array_index(st->matches, Tpattern, i).pattern);
91 g_print(" has selfhighlight %s and blockhighlight %s\n",g_array_index(st->matches, Tpattern, i).selfhighlight,g_array_index(st->matches, Tpattern, i).blockhighlight); */
92 if (g_array_index(st->matches, Tpattern, i).selfhighlight) {
93 g_array_index(st->matches, Tpattern, i).selftag =
94 langmrg_lookup_tag_highlight(lang, g_array_index(st->matches, Tpattern, i).selfhighlight);
95 if (g_array_index(st->matches, Tpattern, i).selftag)
96 retlist = g_list_prepend(retlist, g_array_index(st->matches, Tpattern, i).selftag);
97 else
98 g_print("Possible error in language file, no textstyle found for highlight %s\n",
99 g_array_index(st->matches, Tpattern, i).selfhighlight);
100 }
101 /*if (g_array_index(st->matches, Tpattern, i).blockhighlight) {
102 g_array_index(st->matches, Tpattern, i).blocktag =
103 langmrg_lookup_tag_highlight(lang, g_array_index(st->matches, Tpattern, i).blockhighlight);
104 if (g_array_index(st->matches, Tpattern, i).blocktag)
105 retlist = g_list_prepend(retlist, g_array_index(st->matches, Tpattern, i).blocktag);
106 }*/
107 }
108 for (i = 0; i < (st->blocks->len); i++) {
109 if (g_array_index(st->blocks, Tpattern_block, i).highlight) {
110 g_array_index(st->blocks, Tpattern_block, i).tag = langmrg_lookup_tag_highlight(lang, g_array_index(st->blocks, Tpattern_block, i).highlight);
111 if (g_array_index(st->blocks, Tpattern_block, i).tag) {
112 retlist = g_list_prepend(retlist, g_array_index(st->blocks, Tpattern_block, i).tag);
113 } else {
114 g_print("Possible error in language file, no textstyle found for block highlight %s\n",
115 g_array_index(st->blocks, Tpattern_block, i).highlight);
116 }
117 }
118 }
119
120 /* now remove all duplicate tags */
121 tmplist = retlist = g_list_sort(retlist, (GCompareFunc) pointersort_compare);
122 while (tmplist) {
123 if (tmplist->data == temp) {
124 GList *tofree = tmplist;
125 tmplist = tmplist->next;
126 /* duplicate ! */
127 tofree->prev->next = tmplist;
128 if (tmplist)
129 tmplist->prev = tofree->prev;
130 g_list_free_1(tofree);
131 } else {
132 /* not duplicate */
133 temp = tmplist->data;
134 tmplist = tmplist->next;
135 }
136 }
137
138 return retlist;
139 }
140
141 /*static void print_characters(gchar *characters) {
142 int i;
143 DBG_PATCOMPILE("we have active characters ");
144 for (i=0;i<NUMSCANCHARS;i++) {
145 if (characters[i]==1)
146 DBG_PATCOMPILE("%c ",i);
147 }
148 DBG_PATCOMPILE("\n");
149 }*/
150
151 static gint
fill_characters_from_range(gchar * input,gchar * characters)152 fill_characters_from_range(gchar * input, gchar * characters)
153 {
154 gboolean reverse = 0;
155 gint i = 0;
156 if (input[i] == '^') { /* see if it is a NOT pattern */
157 reverse = 1;
158 memset(characters, 1, NUMSCANCHARS * sizeof(char));
159 i++;
160 }
161 if (input[i] == ']') { /* ] is actually part of the collection */
162 characters['['] = 1;
163 i++;
164 }
165 if (input[i] == '-') { /* - is actually part of the collection */
166 characters['-'] = 1;
167 i++;
168 }
169 while (input[i] != ']') {
170 if (input[i] == '-') { /* range all characters between the previous and the next char */
171 gchar j;
172 if (input[i + 1] == ']') { /* - is actually part of the collection */
173 characters['-'] = 1;
174 i++;
175 } else {
176 DBG_PATCOMPILE("set characters from %c to %c to %d\n", input[i - 1], input[i + 1],
177 1 - reverse);
178 for (j = input[i - 1]; j <= input[i + 1]; j++) {
179 characters[(int) j] = 1 - reverse;
180 }
181 i += 2;
182 }
183 } else {
184 DBG_PATCOMPILE("set characters %c (pos %d) to %d\n", input[i], input[i], 1 - reverse);
185 characters[(int) input[i]] = 1 - reverse;
186 i++;
187 }
188 }
189 return i;
190 }
191
192 static void
push_on_stack(GQueue * stack,gint state)193 push_on_stack(GQueue * stack, gint state)
194 {
195 GList *tmplist;
196 tmplist = stack->head;
197 while (tmplist) {
198 if (GPOINTER_TO_INT(tmplist->data) == state)
199 return;
200 tmplist = tmplist->next;
201 }
202 g_queue_push_head(stack, GINT_TO_POINTER(state));
203 }
204
205 static void
create_state_tables(Tscantable * st,gint16 context,gchar * characters,gboolean pointtoself,GQueue * positions,GQueue * newpositions,gboolean end_is_symbol)206 create_state_tables(Tscantable * st, gint16 context, gchar * characters, gboolean pointtoself,
207 GQueue * positions, GQueue * newpositions, gboolean end_is_symbol)
208 {
209 guint c, pos;
210 const guint identstate =1;
211 gint newstate = -1; /* if all characters can follow existing states we don't need any newstate
212 and thus newstate==-1 but if one or more characters in one or more states need a new state
213 it will be >0 */
214 DBG_PATCOMPILE("create_state_tables, context=%d, started with get_length(positions)=%d states, pointtoself=%d, end_is_symbol=%d\n",
215 (gint)context,g_queue_get_length(positions), pointtoself,end_is_symbol);
216 while (g_queue_get_length(positions)) {
217 pos = GPOINTER_TO_INT(g_queue_pop_head(positions));
218 /*DBG_PATCOMPILE("working on position %d, identstate=%d\n", pos, identstate);*/
219 for (c = 0; c < NUMSCANCHARS; c++) {
220 if (characters[c] == 1) {
221 DBG_PATCOMPILE("running for position %d char %c (=%d)\n",pos,c,c);
222 /* there is a bug in the pattern compiler that may compile a buggy language file without a warning:
223 if a new keyword is added, and all characters already exist there are no new states created: this function
224 will simply follow the existing states. BUT IT DOES NOT CHECK IF THERE ARE MULTIPLE WAYS TO GET TO THESE
225 EXISTING STATES. So if those states are created by a regex pattern that allows jumping to these states via
226 various other characters, you will have an overlapping pattern WITHOUT WARNING.
227
228 You can avoid this bug by having regex patterns alwats in the end of a context.
229 */
230
231
232 if (get_tablerow(st, context, pos).row[c] != 0
233 && get_tablerow(st, context, pos).row[c] != identstate) {
234 if (get_tablerow(st, context, pos).row[c] == pos) {
235 /* this state is a 'morestate', a state that points to itself, for a (regex) pattern such as a+ or a* */
236 if (pointtoself) {
237 push_on_stack(newpositions,
238 (gint) get_tablerow(st, context, pos).row[c]);
239 } else {
240 if (newstate == -1) {
241 newstate = get_table(st, context)->len;
242 DBG_PATCOMPILE("create_state_tables, create newstate %d from morestate %d\n",
243 newstate, pos);
244 g_array_set_size(get_table(st,context), get_table(st,context)->len + 1);
245 if (get_table(st,context)->len+1 >= G_MAXUINT16) {
246 g_print("Critical error in language file: state table overflow!!!!!!!!\n");
247 }
248 /* pass-on the morestate */
249 memcpy(get_tablerow(st, context, newstate).row,
250 get_tablerow(st, context, pos).row,
251 sizeof(guint16[NUMSCANCHARS]));
252 get_tablerow(st, context, pos).row[c] = newstate;
253 g_queue_push_head(newpositions, GINT_TO_POINTER(newstate));
254 } else {
255 get_tablerow(st, context, pos).row[c] = newstate;
256 }
257 /*g_print("*****************************\nBUG: we cannot parse this pattern after a regex pattern yet !!!!!!!!!!!!!\nstate tables will be incorrect!!!\n*****************************\n"); */
258 }
259 } else {
260 if (pointtoself) { /* perhaps check if we have this state on the stack already */
261 push_on_stack(positions, (gint) get_tablerow(st, context, pos).row[c]);
262 } else {
263 if (get_tablerow(st, context, pos).row[c] > pos) {
264 DBG_PATCOMPILE("points to state %d, pushing %d on newpositions\n", (gint) get_tablerow(st, context, pos).row[c], (gint) get_tablerow(st, context, pos).row[c]);
265 push_on_stack(newpositions,
266 (gint) get_tablerow(st, context, pos).row[c]);
267 } else { /* this is probably a morestate that has been passed-on several states */
268 if (newstate == -1) {
269 newstate = get_table(st, context)->len;
270 DBG_PATCOMPILE
271 ("create_state_tables, create newstate %d from morestate %d\n",
272 newstate, pos);
273 g_array_set_size(get_table(st,context), get_table(st,context)->len + 1);
274 if (get_table(st,context)->len+1 >= G_MAXUINT16) {
275 g_print("Critical error in language file: state table overflow!!!!!!!!\n");
276 }
277 /* pass-on the morestate */
278 memcpy(get_tablerow(st, context, newstate).row,
279 get_tablerow(st, context, pos).row,
280 sizeof(guint16[NUMSCANCHARS]));
281 get_tablerow(st, context, pos).row[c] = newstate;
282 g_queue_push_head(newpositions, GINT_TO_POINTER(newstate));
283 } else {
284 get_tablerow(st, context, pos).row[c] = newstate;
285 }
286 }
287
288 }
289 }
290 } else {
291 if (newstate == -1) {
292 get_tablerow(st, context, pos).row[c] = newstate = get_table(st,context)->len;
293 DBG_PATCOMPILE("create_state_tables, create newstate %d, pointtoself=%d\n", newstate,
294 pointtoself);
295 g_array_set_size(get_table(st,context), get_table(st,context)->len + 1);
296 if (get_table(st,context)->len+1 >= G_MAXUINT16) {
297 g_print("Critical error in language file: state table overflow!!!!!!!!\n");
298 }
299 if (!character_is_symbol(st, context, c)) {
300 /* normally this memcpy copies the identstate to the current state such that all symbols still point to the
301 startstate but all non-symbols point to the identstate */
302 DBG_PATCOMPILE
303 ("create_state_tables, newstate %d does not end on a symbol, init all values equal to the identstate %d\n",
304 newstate, 1);
305 memcpy(get_tablerow(st,context,newstate).row,
306 get_tablerow(st,context,identstate).row,
307 sizeof(guint16[NUMSCANCHARS]));
308 } else {
309 DBG_PATCOMPILE
310 ("create_state_tables, newstate %d ends on a symbol, so init all values with 0\n",
311 newstate);
312 }
313 g_queue_push_head(newpositions, GINT_TO_POINTER(newstate));
314 if (pointtoself) {
315 guint d;
316 for (d = 0; d < NUMSCANCHARS; d++) {
317 if (characters[d] == 1) {
318 /*DBG_PATCOMPILE("in newstate %d, character %c points to %d\n",newstate,d,newstate); */
319 get_tablerow(st,context,newstate).row[d] = newstate;
320 }
321 }
322 }
323 } else {
324 get_tablerow(st,context,pos).row[c] = newstate;
325 /* nothing to put on the stack, newstate should be on newpositions already if it is not -1 */
326 }
327 }
328 }
329 }
330 }
331 DBG_PATCOMPILE("create_state_tables, done, get_lenght(newpositions)=%d\n", g_queue_get_length(newpositions));
332 }
333
334 static void
merge_queues(GQueue * target,GQueue * src)335 merge_queues(GQueue * target, GQueue * src)
336 {
337 while (g_queue_get_length(src)) {
338 DBG_PATCOMPILE("merge queue, push state %d to queue\n", GPOINTER_TO_INT(g_queue_peek_head(src)));
339 g_queue_push_head(target, g_queue_pop_head(src));
340 }
341 DBG_PATCOMPILE("merge queue, target queue has length %d \n", g_queue_get_length(target));
342 }
343
344 static GQueue *process_regex_part(Tscantable * st, gchar * regexpart, gint16 context,
345 gboolean caseinsensitive, GQueue * inputpositions,
346 gboolean regexpart_ends_regex);
347
348 static GQueue *
run_subpatterns(Tscantable * st,gchar * regexpart,gint16 context,gboolean caseinsensitive,GQueue * inputpositions,gint * regexpartpos,gboolean regexpart_ends_regex)349 run_subpatterns(Tscantable * st, gchar * regexpart, gint16 context, gboolean caseinsensitive,
350 GQueue * inputpositions, gint * regexpartpos, gboolean regexpart_ends_regex)
351 {
352 gint j = 0;
353 gchar *target;
354 GQueue *mergednewpositions = g_queue_new();
355 target = g_strdup(®expart[*regexpartpos]); /* a very easy way to make target a buffer long enough to hold any subpattern */
356
357 while (regexpart[*regexpartpos] != '\0') {
358 DBG_PATCOMPILE("run_subpatterns, regexpart[%d]=%c\n", *regexpartpos, regexpart[*regexpartpos]);
359 if (regexpart[*regexpartpos] == '\\') {
360 if (regexpart[*regexpartpos + 1] == '|' || regexpart[*regexpartpos + 1] == ')') {
361 target[j] = regexpart[*regexpartpos + 1];
362 *regexpartpos = *regexpartpos + 1;
363 j++;
364 } else {
365 target[j] = regexpart[*regexpartpos];
366 j++;
367 }
368 } else if ((regexpart[*regexpartpos] == '|' || regexpart[*regexpartpos] == ')')) {
369 /* found a subpattern */
370 GQueue *newpositions;
371 target[j] = '\0';
372 DBG_PATCOMPILE("at regexpartpos=%d found SUBPATTERN %s\n", *regexpartpos, target);
373 newpositions =
374 process_regex_part(st, target, context, caseinsensitive, inputpositions,
375 regexpart_ends_regex);
376 merge_queues(mergednewpositions, newpositions);
377 g_queue_free(newpositions);
378 j = 0;
379
380 if (regexpart[*regexpartpos] == ')') {
381 g_free(target);
382 return mergednewpositions;
383 }
384 } else {
385 target[j] = regexpart[*regexpartpos];
386 j++;
387 }
388 *regexpartpos = *regexpartpos + 1;
389 }
390 g_free(target);
391 return mergednewpositions;
392 }
393
394 /* this function can be called recursively for subpatterns. It gets all current valid states in
395 inputpositions, and returns all valid outputstates.
396
397 regexpart_ends_regex is a boolean that is used to set the state for the last character of the regex.
398 if the last character of a pattern is a symbol, the states are different. because this function is
399 called recursively this is only relevant for the first call.
400 BUG: this is not true, for a pattern like <(a|b) the a and b are both end-patterns, but they are handled in a
401 subpattern
402
403
404 */
405 static GQueue *
process_regex_part(Tscantable * st,gchar * regexpart,gint16 context,gboolean caseinsensitive,GQueue * inputpositions,gboolean regexpart_ends_regex)406 process_regex_part(Tscantable * st, gchar * regexpart, gint16 context, gboolean caseinsensitive,
407 GQueue * inputpositions, gboolean regexpart_ends_regex)
408 {
409 gboolean escaped = FALSE;
410 gint i = 0;
411 char characters[NUMSCANCHARS];
412 GQueue *newpositions, *positions, *tmp;
413
414 positions = g_queue_copy(inputpositions);
415 newpositions = g_queue_new();
416 DBG_PATCOMPILE("process_regex_part, running part %s\n", regexpart);
417 while (1) {
418 memset(&characters, 0, NUMSCANCHARS * sizeof(char));
419 escaped = 0;
420 DBG_PATCOMPILE("start of loop, regexpart[%d]=%c, have %d positions\n", i, regexpart[i],
421 g_queue_get_length(positions));
422 if (regexpart[i] == '\0') { /* end of pattern */
423 DBG_PATCOMPILE("end of pattern, positions(%p) has %d entries\n", positions,
424 g_queue_get_length(positions));
425 g_queue_free(newpositions);
426 return positions;
427 } else {
428 if (!escaped && regexpart[i] == '\\') {
429 DBG_PATCOMPILE("found \\ at position %d:next pattern is escaped\n", i);
430 escaped = TRUE;
431 i++;
432 }
433 if (!escaped && regexpart[i] == '(') {
434 gchar *tmp;
435 gboolean subpattern_ends_regex = FALSE;
436 /* a subpattern */
437 DBG_PATCOMPILE("found subpatern start at %d\n", i);
438 i++;
439 tmp = strchr(®expart[i], ')');
440 /* BUG: this code doesn't handle an escaped \) sequence */
441 if (tmp) {
442 tmp++;
443 if (*tmp == '\0')
444 subpattern_ends_regex = TRUE;
445 }
446 newpositions =
447 run_subpatterns(st, regexpart, context, caseinsensitive, positions, &i,
448 subpattern_ends_regex);
449 DBG_PATCOMPILE("end of subpatern at %d (%c)\n", i, regexpart[i]);
450 /* BUG: if the subpattern is followed by an operator foo(bla)? this is not handled by the code !!!!!!!! */
451 } else {
452 gboolean end_is_symbol = FALSE;
453 if (!escaped) {
454 switch (regexpart[i]) {
455 case '.':
456 memset(&characters, 1, NUMSCANCHARS * sizeof(char));
457 break;
458 case '[':
459 DBG_PATCOMPILE("found range at i=%d, fill characters\n", i);
460 i += fill_characters_from_range(®expart[i + 1], characters) + 1;
461 break;
462 default:
463 characters[(int) regexpart[i]] = 1;
464 break;
465 }
466 } else { /* escaped */
467 DBG_PATCOMPILE("character %c was escaped, adding to characters\n", regexpart[i]);
468 characters[(int) regexpart[i]] = 1;
469 }
470 /* handle case insensitiveness */
471 if (caseinsensitive) {
472 gint j;
473 for (j = 'a'; j <= 'z'; j++) {
474 if (characters[j] == 1)
475 characters[j - 32] = 1;
476 if (characters[j - 32] == 1)
477 characters[j] = 1;
478 }
479 }
480 /* TODO end_is_symbol no longer used in create_state_tables() , so it can be removed*/
481 DBG_PATCOMPILE("i=%d, testing i+1 (%c) for operator\n", i, regexpart[i + 1]);
482 /*print_characters(characters); */
483 /* see if there is an operator */
484 if (regexpart[i] == '\0') {
485 DBG_PATCOMPILE("no operator, end of (sub)pattern\n");
486 create_state_tables(st, context, characters, FALSE, positions, newpositions,
487 end_is_symbol);
488 } else if (regexpart[i + 1] == '+') {
489 DBG_PATCOMPILE("handling + operator\n");
490 create_state_tables(st, context, characters, TRUE, positions, newpositions,
491 end_is_symbol);
492 i++;
493 } else if (regexpart[i + 1] == '*') {
494 GQueue *tmp = g_queue_copy(positions);
495 DBG_PATCOMPILE("handling * operator\n");
496 create_state_tables(st, context, characters, TRUE, positions, newpositions,
497 end_is_symbol);
498 merge_queues(newpositions, tmp);
499 g_queue_free(tmp);
500 i++;
501 } else if (regexpart[i + 1] == '?') {
502 GQueue *tmp = g_queue_copy(positions);
503 DBG_PATCOMPILE("handling ? operator\n");
504 create_state_tables(st, context, characters, FALSE, positions, newpositions,
505 end_is_symbol);
506 merge_queues(newpositions, tmp);
507 g_queue_free(tmp);
508 i++;
509 } else {
510 DBG_PATCOMPILE("no operator\n");
511 create_state_tables(st, context, characters, FALSE, positions, newpositions,
512 end_is_symbol);
513 }
514 }
515 g_queue_clear(positions);
516 tmp = positions;
517 positions = newpositions;
518 newpositions = tmp;
519 }
520 i++;
521 }
522 /* cannot get here */
523 }
524
525 static void
compile_limitedregex_to_DFA(Tscantable * st,gchar * input,gboolean caseinsensitive,guint16 matchnum,gint16 context,gpointer ldb)526 compile_limitedregex_to_DFA(Tscantable * st, gchar * input, gboolean caseinsensitive, guint16 matchnum,
527 gint16 context, gpointer ldb)
528 {
529 GQueue *positions, *newpositions;
530 gchar *lregex;
531 gint p;
532 positions = g_queue_new();
533
534 if (caseinsensitive) {
535 /* make complete string lowercase */
536 lregex = g_ascii_strdown(input, -1);
537 } else {
538 lregex = g_strdup(input);
539 }
540
541 g_queue_push_head(positions,
542 GINT_TO_POINTER((gint) 0));
543 DBG_PATCOMPILE("lregex=%s, positionstack has length %d\n", lregex, g_queue_get_length(positions));
544 newpositions = process_regex_part(st, lregex, context, caseinsensitive, positions, TRUE);
545 /*compile_limitedregex_to_DFA_backend(st,lregex,context,caseinsensitive,&positions); */
546 DBG_PATCOMPILE("after compiling positionstack has length %d\n", g_queue_get_length(positions));
547 while ((g_queue_get_length(newpositions))) {
548 p = GPOINTER_TO_INT(g_queue_pop_head(newpositions));
549 DBG_PATCOMPILE("mark state %d as possible end-state\n", p);
550 if (get_tablerow(st, context, p).match != 0
551 && get_tablerow(st, context, p).match != matchnum) {
552 gchar *dbstring = ldb_stack_string(ldb);
553 g_print("Error in language file %s: patterns %s and %s in context %d overlap each other\n", dbstring, input,
554 g_array_index(st->matches, Tpattern,
555 get_tablerow(st, context, p).match).pattern, context);
556 g_free(dbstring);
557 } else {
558 get_tablerow(st, context, p).match = matchnum;
559 }
560 }
561 g_queue_free(positions);
562 g_queue_free(newpositions);
563 g_free(lregex);
564
565 #ifdef ENABLE_PRINT_DFA
566 if (context == 1) {
567 print_DFA_subset(st, context, "a=1; \n\t");
568 }
569 #endif
570 }
571
572 /* this function cannot do any regex style patterns
573 just full keywords */
574 static void
compile_keyword_to_DFA(Tscantable * st,const gchar * keyword,guint16 matchnum,gint16 context,gboolean case_insens,gpointer ldb)575 compile_keyword_to_DFA(Tscantable * st, const gchar * keyword, guint16 matchnum, gint16 context,
576 gboolean case_insens, gpointer ldb)
577 {
578 gint i, len;
579 GQueue *positions, *newpositions;
580 gchar *pattern;
581 gchar characters[NUMSCANCHARS];
582 gboolean end_is_symbol;
583
584 positions = g_queue_new();
585 newpositions = g_queue_new();
586
587 /* compile the keyword into the DFA */
588 g_queue_push_head(positions,
589 GINT_TO_POINTER((gint) 0));
590
591 if (!keyword) {
592 g_warning("compile_keyword_to_DFA, called without keyword!\n");
593 return;
594 }
595
596 if (case_insens) {
597 /* make complete string lowercase */
598 pattern = g_ascii_strdown(keyword, -1);
599 } else {
600 pattern = g_strdup(keyword);
601 }
602
603 DBG_PATCOMPILE("in context %d we start with position %d\n", (gint)context, GPOINTER_TO_INT(g_queue_peek_head(positions)));
604 len = strlen(pattern);
605
606 end_is_symbol =
607 character_is_symbol(st, context, (gint) pattern[len - 1]);
608
609 for (i = 0; i <= len; i++) {
610 /*GQueue *tmp; */
611 int c = pattern[i];
612 memset(&characters, 0, NUMSCANCHARS * sizeof(char));
613 if (c == '\0') {
614 while ((g_queue_get_length(positions))) {
615 gint p;
616 p = GPOINTER_TO_INT(g_queue_pop_head(positions));
617 DBG_PATCOMPILE("mark state %d as possible end-state\n", p);
618 if (get_tablerow(st, context, p).match != 0
619 && get_tablerow(st, context, p).match != matchnum) {
620 gchar *dbstring = ldb_stack_string(ldb);
621 g_print("Error in language file %s: patterns %s and %s in context %d overlap each other\n",
622 dbstring, keyword, g_array_index(st->matches, Tpattern,
623 get_tablerow(st, context, p).match).pattern,
624 context);
625 g_free(dbstring);
626 }
627 get_tablerow(st, context, p).match = matchnum;
628 }
629 } else {
630 characters[c] = 1;
631 if (case_insens && c >= 97 && c <= 122)
632 characters[c - 32] = 1;
633 create_state_tables(st, context, characters, FALSE, positions, newpositions, end_is_symbol);
634 }
635 g_queue_clear(positions);
636 pointer_switch_addresses((void *) &positions, (void *) &newpositions);
637 /*tmp = positions;
638 positions = newpositions;
639 newpositions = tmp; */
640 }
641 g_queue_clear(positions);
642 g_queue_clear(newpositions);
643 g_queue_free(positions);
644 g_queue_free(newpositions);
645 g_free(pattern);
646 }
647
648 /*
649 * symbols can be free'ed
650 * contexthighlight is directly stored in Tcontext
651 *
652 *
653 */
654 gint16
new_context(Tscantable * st,guint expected_size,const gchar * symbols,const gchar * contexthighlight,gboolean autocomplete_case_insens,gboolean default_spellcheck,gboolean dump_dfa_run)655 new_context(Tscantable * st, guint expected_size, const gchar * symbols, const gchar * contexthighlight,
656 gboolean autocomplete_case_insens, gboolean default_spellcheck, gboolean dump_dfa_run)
657 {
658 gint16 context;
659 gint i;
660 const gchar *tmp;
661 GArray *tmptable;
662
663 context = st->contexts->len;
664 g_array_set_size(st->contexts, st->contexts->len + 1);
665
666 g_array_index(st->contexts, Tcontext, context).autocomplete_case_insens = autocomplete_case_insens;
667 g_array_index(st->contexts, Tcontext, context).default_spellcheck = default_spellcheck;
668 g_array_index(st->contexts, Tcontext, context).dump_dfa_run = dump_dfa_run;
669 g_array_index(st->contexts, Tcontext, context).contexthighlight = (gchar *) contexthighlight;
670 tmptable = g_array_index(st->contexts, Tcontext, context).table = g_array_sized_new(TRUE, TRUE, sizeof(Ttablerow), expected_size);
671 g_array_set_size(g_array_index(st->contexts, Tcontext, context).table, 2); /* first two states are the startstate (0) and the identstate (1) */
672 /* identstate refers to itself for all characters except the symbols. we cannot use memset
673 because an guint16 occupies 2 bytes */
674 for (i = 0; i < NUMSCANCHARS; i++)
675 g_array_index(tmptable, Ttablerow, 1).row[i] = 1;
676
677 /* 0, \0 or NULL is always a symbol */
678 g_array_index(tmptable, Ttablerow, 1).row[0] = 0;
679 tmp = symbols;
680 while (*tmp) {
681 /*g_print("mark %c as symbol\n",*tmp); */
682 g_array_index(tmptable, Ttablerow, 1).row[(int) *tmp] = 0;
683
684 if (st->allsymbols[(int) *tmp] == 0)
685 st->allsymbols[(int) *tmp] = 1;
686
687 tmp++;
688 }
689 /* now copy the identstate to the startstate, so every symbol in the startstate will point to the identstate */
690 memcpy(g_array_index(tmptable, Ttablerow, 0).row,
691 g_array_index(tmptable, Ttablerow, 1).row, sizeof(guint16[NUMSCANCHARS]));
692
693 return context;
694 }
695
696 void
match_set_nextcontext(Tscantable * st,guint16 matchnum,guint16 nextcontext)697 match_set_nextcontext(Tscantable * st, guint16 matchnum, guint16 nextcontext)
698 {
699 DBG_PATCOMPILE("match_set_nextcontext, set match %d to have nextcontext %d\n", matchnum, nextcontext);
700 g_array_index(st->matches, Tpattern, matchnum).nextcontext = nextcontext;
701 }
702
703 void
match_autocomplete_reference(Tscantable * st,guint16 matchnum,guint16 context)704 match_autocomplete_reference(Tscantable * st, guint16 matchnum, guint16 context)
705 {
706 gint pattern_id = matchnum;
707 gboolean has_reference;
708
709 has_reference = (g_array_index(st->matches, Tpattern, matchnum).reference!=NULL);
710
711 if (!g_array_index(st->contexts, Tcontext, context).patternhash) {
712 g_array_index(st->contexts, Tcontext, context).patternhash =
713 g_hash_table_new(g_str_hash, g_str_equal);
714 }
715 if (has_reference) {
716 g_hash_table_insert(g_array_index(st->contexts, Tcontext, context).patternhash,
717 g_array_index(st->matches, Tpattern, matchnum).pattern, GINT_TO_POINTER(pattern_id));
718 }
719
720 if (g_array_index(st->matches, Tpattern, matchnum).tagclose_from_blockstack
721 && !g_array_index(st->contexts, Tcontext, context).has_tagclose_from_blockstack) {
722 DBG_AUTOCOMP("pattern %d in context %d has tagclose from blockstack!\n", matchnum, context);
723 g_array_index(st->contexts, Tcontext, context).has_tagclose_from_blockstack = 1;
724 }
725
726 if (g_array_index(st->matches, Tpattern, matchnum).autocomp_items) {
727 GSList *tmpslist = g_array_index(st->matches, Tpattern, matchnum).autocomp_items;
728 GList *list = NULL;
729 if (!g_array_index(st->contexts, Tcontext, context).ac) {
730 DBG_PATCOMPILE("create g_completion for context %d\n", context);
731 g_array_index(st->contexts, Tcontext, context).ac = g_completion_new(NULL);
732 if (g_array_index(st->contexts, Tcontext, context).autocomplete_case_insens)
733 g_completion_set_compare(g_array_index(st->contexts, Tcontext, context).ac, strncasecmp);
734 g_array_index(st->contexts, Tcontext, context).autocomplete_has_conditions = 0; /* defaults to 0 */
735 }
736 while (tmpslist) {
737 gboolean already_handled=FALSE;
738 Tpattern_autocomplete *pac = tmpslist->data;
739 if (pac->condition != 0) {
740 g_array_index(st->contexts, Tcontext, context).autocomplete_has_conditions = 1;
741
742 /* there is a special situation in Bluefish, if a attribute-context is re-used, a new conditional item
743 is added to the '>' pattern, and match_autocomplete_reference() is called AGAIN for the same pattern.
744 we can detect this by looking it up in the hash table */
745 if (GPOINTER_TO_INT(g_hash_table_lookup(g_array_index(st->contexts, Tcontext, context).patternhash,pac->autocomplete_string))==pattern_id) {
746 DBG_AUTOCOMP("match_autocomplete_reference, ignore autocomplete string %s, is already handled\n",pac->autocomplete_string);
747 already_handled = TRUE;
748 }
749 }
750 #ifdef DEVELOPMENT
751 if (g_strcmp0(pac->autocomplete_string, g_array_index(st->matches, Tpattern, matchnum).pattern)!=0) {
752 gint hashpat = GPOINTER_TO_INT(g_hash_table_lookup(g_array_index(st->contexts, Tcontext, context).patternhash,pac->autocomplete_string));
753 if (hashpat && !already_handled) {
754 g_print("autocompletion item %s is already in the context %d hash table for pattern %d, now handling pattern %d that has %d autocomplete entries?!\n", pac->autocomplete_string, context, hashpat, pattern_id, g_slist_length(g_array_index(st->matches, Tpattern, matchnum).autocomp_items));
755 }
756 }
757 #endif
758 if (!already_handled) {
759 list = g_list_prepend(list, pac->autocomplete_string);
760 /* we only need to add an autocomplete string to the hash table if the pattern has a
761 reference text, or backup_cursor is set, or trigger_new_autocomp_popup, or there
762 is a condition AND if the autocompletion patern is different than the pattern itself (which is
763 added to the hash table earlier in this function) */
764 if (g_strcmp0(pac->autocomplete_string, g_array_index(st->matches, Tpattern, matchnum).pattern)!=0 && (has_reference || pac->condition != 0 || pac->autocomplete_backup_cursor!=0 || pac->trigger_new_autocomp_popup!=0)) {
765 DBG_AUTOCOMP("match_autocomplete_reference, enter autocompletion item %s in hash table\n",pac->autocomplete_string);
766 g_hash_table_insert(g_array_index(st->contexts, Tcontext, context).patternhash,
767 pac->autocomplete_string, GINT_TO_POINTER(pattern_id));
768 }
769 }
770 tmpslist = g_slist_next(tmpslist);
771 }
772 g_completion_add_items(g_array_index(st->contexts, Tcontext, context).ac, list);
773 g_list_free(list);
774 }
775 }
776 /*
777 * refname should not be freed, it is set in Tpattern_condition
778 *
779 */
780 static guint16
add_condition(Tscantable * st,const gchar * refname,gint relation,gint mode)781 add_condition(Tscantable * st, const gchar *refname, gint relation, gint mode)
782 {
783 guint16 condnum = st->conditions->len;
784 DBG_PARSING("add_condition, condnum=%d, called with refname %s\n",condnum,refname);
785 g_array_set_size(st->conditions, st->conditions->len + 1);
786 if (mode < 1 || mode > 4) {
787 g_warning("Error in language file: condition mode %d is not defined\n",mode);
788 return 0;
789 }
790 g_array_index(st->conditions, Tpattern_condition, condnum).refname = g_strdup(refname);
791 g_array_index(st->conditions, Tpattern_condition, condnum).parentrelation = relation;
792 g_array_index(st->conditions, Tpattern_condition, condnum).relationtype = mode;
793 return condnum;
794 }
795
796 /*
797 * autocomplete_string should be free'ed
798 * autocomplete_append should be free'ed
799 * refname should be free'ed
800 *
801 *
802 */
803 void
match_add_autocomp_item(Tscantable * st,guint16 matchnum,const gchar * autocomplete_string,const gchar * autocomplete_append,guint8 autocomplete_backup_cursor,guint8 trigger_new_autocomp_popup,const gchar * refname,gint relation,gint mode)804 match_add_autocomp_item(Tscantable * st, guint16 matchnum, const gchar * autocomplete_string,
805 const gchar * autocomplete_append, guint8 autocomplete_backup_cursor,
806 guint8 trigger_new_autocomp_popup,
807 const gchar *refname, gint relation, gint mode)
808 {
809 Tpattern_autocomplete *pac = g_slice_new(Tpattern_autocomplete);
810 if (autocomplete_string) {
811 pac->autocomplete_string = g_strdup(autocomplete_string);
812 } else if (autocomplete_append) {
813 pac->autocomplete_string =
814 g_strconcat(g_array_index(st->matches, Tpattern, matchnum).pattern, autocomplete_append, NULL);
815 } else {
816 pac->autocomplete_string = g_array_index(st->matches, Tpattern, matchnum).pattern;
817 }
818
819 #ifdef DEVELOPMENT
820 GSList *slist;
821 slist=g_array_index(st->matches, Tpattern, matchnum).autocomp_items;
822 while (slist) {
823 Tpattern_autocomplete *tpac = slist->data;
824 if (g_strcmp0(pac->autocomplete_string, tpac->autocomplete_string)==0) {
825 g_print("Warning: string %s exists as autocomplete string for pattern %d\n",pac->autocomplete_string,matchnum);
826 }
827 slist = g_slist_next(slist);
828 }
829 #endif
830 pac->autocomplete_backup_cursor = autocomplete_backup_cursor;
831 pac->trigger_new_autocomp_popup = trigger_new_autocomp_popup;
832 pac->condition = (mode==0) ? 0 : add_condition(st, refname, relation, mode);
833 g_array_index(st->matches, Tpattern, matchnum).autocomp_items =
834 g_slist_prepend(g_array_index(st->matches, Tpattern, matchnum).autocomp_items, pac);
835 }
836
837 void
match_set_reference(Tscantable * st,guint16 matchnum,const gchar * reference)838 match_set_reference(Tscantable * st, guint16 matchnum, const gchar * reference)
839 {
840 if (reference)
841 g_array_index(st->matches, Tpattern, matchnum).reference = g_strdup(reference);
842 }
843
844 void
compile_existing_match(Tscantable * st,guint16 matchnum,gint16 context,gpointer ldb)845 compile_existing_match(Tscantable * st, guint16 matchnum, gint16 context, gpointer ldb)
846 {
847 DBG_PATCOMPILE("compile existing match %d (%s) in context %d\n", matchnum,
848 g_array_index(st->matches, Tpattern, matchnum).pattern, context);
849 if (g_array_index(st->matches, Tpattern, matchnum).is_regex) {
850 compile_limitedregex_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern,
851 g_array_index(st->matches, Tpattern, matchnum).case_insens, matchnum,
852 context, ldb);
853 } else {
854 compile_keyword_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern, matchnum, context,
855 g_array_index(st->matches, Tpattern, matchnum).case_insens, ldb);
856 }
857 match_autocomplete_reference(st, matchnum, context);
858 }
859
860 void
pattern_set_condition(Tscantable * st,guint16 matchnum,gchar * refname,gint relation,gint mode)861 pattern_set_condition(Tscantable * st, guint16 matchnum, gchar *refname, gint relation, gint mode)
862 {
863 guint16 condnum = add_condition(st, refname, relation, mode);
864 DBG_PARSING("pattern %d (%s) now has condition %d\n",matchnum,g_array_index(st->matches, Tpattern, matchnum).pattern,condnum);
865 g_array_index(st->matches, Tpattern, matchnum).condition = condnum;
866 }
867
868 void
pattern_set_blockmatch(Tscantable * st,guint16 matchnum,gboolean starts_block,gboolean ends_block,guint blockstartpattern,const gchar * blockhighlight,const gchar * blockname,gboolean foldable)869 pattern_set_blockmatch(Tscantable * st, guint16 matchnum,
870 gboolean starts_block,
871 gboolean ends_block,
872 guint blockstartpattern,
873 const gchar *blockhighlight,
874 const gchar *blockname,
875 gboolean foldable)
876 {
877 if (starts_block == TRUE && ends_block == TRUE) {
878 g_warning("Error in language file or Bluefish bug: pattern %s both starts and ends a block\n",
879 g_array_index(st->matches, Tpattern, matchnum).pattern);
880 }
881 if (blockname && !starts_block) {
882 g_warning("Error in language file or Bluefish bug: block_name %s can only be set on a block start\n", blockname);
883 }
884
885 if (starts_block) {
886 guint16 blocknum = 0;
887 if (blockname || blockhighlight) { /* only create a block if we need it */
888 blocknum = st->blocks->len;
889 g_array_set_size(st->blocks, st->blocks->len + 1);
890 g_array_index(st->blocks, Tpattern_block, blocknum).highlight = (gchar *) blockhighlight;
891 g_array_index(st->blocks, Tpattern_block, blocknum).name = (gchar *) blockname;
892 g_array_index(st->blocks, Tpattern_block, blocknum).foldable = foldable;
893 g_array_index(st->blocks, Tpattern_block, blocknum).tag = NULL;
894 }
895 g_array_index(st->matches, Tpattern, matchnum).starts_block = 1;
896 g_array_index(st->matches, Tpattern, matchnum).block = blocknum;
897 } else if (ends_block) {
898 g_array_index(st->matches, Tpattern, matchnum).ends_block = 1;
899 }
900 g_array_index(st->matches, Tpattern, matchnum).blockstartpattern = blockstartpattern;
901 }
902
903 void
pattern_set_runtime_properties(Tscantable * st,guint16 matchnum,const gchar * selfhighlight,gint16 nextcontext,gboolean tagclose_from_blockstack,gboolean stretch_blockstart,guint8 identmode,gboolean identjump,gboolean identautocomp)904 pattern_set_runtime_properties(Tscantable * st, guint16 matchnum,
905 const gchar * selfhighlight,
906 gint16 nextcontext,
907 gboolean tagclose_from_blockstack,
908 gboolean stretch_blockstart,
909 guint8 identmode,
910 gboolean identjump,
911 gboolean identautocomp)
912 {
913 g_array_index(st->matches, Tpattern, matchnum).selfhighlight = (gchar *) selfhighlight;
914 g_array_index(st->matches, Tpattern, matchnum).nextcontext = nextcontext;
915 g_array_index(st->matches, Tpattern, matchnum).tagclose_from_blockstack = tagclose_from_blockstack;
916 g_array_index(st->matches, Tpattern, matchnum).stretch_blockstart = stretch_blockstart;
917 if (identjump || identautocomp) {
918 g_array_index(st->matches, Tpattern, matchnum).identmode = identmode;
919 g_array_index(st->matches, Tpattern, matchnum).identaction = (identjump?1:0) | (identautocomp?2:0);
920 }
921 }
922
923 guint16
add_pattern_to_scanning_table(Tscantable * st,const gchar * pattern,gboolean is_regex,gboolean case_insens,gint16 context,gpointer ldb)924 add_pattern_to_scanning_table(Tscantable * st, const gchar * pattern,
925 gboolean is_regex,
926 gboolean case_insens,
927 gint16 context, gpointer ldb)
928 {
929 guint16 matchnum;
930 if (!pattern || pattern[0] == '\0') {
931 g_warning("corrupt language file: found empty pattern. Results are undefined\n");
932 return 0;
933 }
934 matchnum = st->matches->len;
935 if (matchnum == G_MAXUINT16) {
936 g_warning("Language file has too many patterns, this will very likely result in a crash!\n");
937 }
938 g_array_set_size(st->matches, st->matches->len + 1);
939 g_array_index(st->matches, Tpattern, matchnum).pattern = g_strdup(pattern);
940 g_array_index(st->matches, Tpattern, matchnum).case_insens = case_insens;
941 g_array_index(st->matches, Tpattern, matchnum).is_regex = is_regex;
942 DBG_PATCOMPILE("add_pattern_to_scanning_table,pattern=%s for context=%d got matchnum %d\n",pattern, context, matchnum);
943 if (is_regex) {
944 compile_limitedregex_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern, case_insens, matchnum, context, ldb);
945 } else {
946 compile_keyword_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern, matchnum, context, case_insens, ldb);
947 }
948 /*if (g_strcmp0(pattern, "rem ")==0 || g_strcmp0(pattern, "\\.?[a-zA-Z][a-zA-Z_0-9]*[\\$%]?")==0) {
949 print_DFA_subset(st, context, "rem var");
950 }*/
951 return matchnum;
952 }
953
954
955 /*guint16
956 add_keyword_to_scanning_table(Tscantable * st, gchar * pattern, const gchar * lang,
957 const gchar * selfhighlight, const gchar * blockhighlight, gboolean is_regex,
958 gboolean case_insens, gint16 context, gint16 nextcontext, gboolean starts_block,
959 gboolean ends_block, guint blockstartpattern,
960 gboolean tagclose_from_blockstack, gboolean stretch_blockstart,
961 guint8 identmode, gboolean identjump, gboolean identautocomp)
962 {
963 guint16 matchnum;
964
965 if (!pattern || pattern[0] == '\0') {
966 g_print("CORRUPT LANGUAGE FILE: empty pattern/tag/keyword\n");
967 return 0;
968 }
969
970 if (context == nextcontext)
971 DBG_PATCOMPILE("context=nextcontext=%d for %s\n", context, pattern);
972
973 matchnum = st->matches->len;
974 DBG_BLOCKMATCH("new match %s at matchnum %d has blockstartpattern %d and nextcontext %d\n", pattern,
975 matchnum, blockstartpattern, nextcontext);
976 g_array_set_size(st->matches, st->matches->len + 1);
977
978 g_array_index(st->matches, Tpattern, matchnum).pattern = g_strdup(pattern);
979 g_array_index(st->matches, Tpattern, matchnum).ends_block = ends_block;
980 g_array_index(st->matches, Tpattern, matchnum).starts_block = starts_block;
981 g_array_index(st->matches, Tpattern, matchnum).blockstartpattern = blockstartpattern;
982 g_array_index(st->matches, Tpattern, matchnum).blockhighlight = (gchar *) blockhighlight;
983 g_array_index(st->matches, Tpattern, matchnum).nextcontext = nextcontext;
984 g_array_index(st->matches, Tpattern, matchnum).case_insens = case_insens;
985 g_array_index(st->matches, Tpattern, matchnum).is_regex = is_regex;
986 g_array_index(st->matches, Tpattern, matchnum).selfhighlight = (gchar *) selfhighlight;
987
988 g_array_index(st->matches, Tpattern, matchnum).tagclose_from_blockstack = tagclose_from_blockstack;
989 g_array_index(st->matches, Tpattern, matchnum).stretch_blockstart = stretch_blockstart;
990 #ifdef IDENTSTORING
991 if (identjump || identautocomp) {
992 g_array_index(st->matches, Tpattern, matchnum).identmode = identmode;
993 g_array_index(st->matches, Tpattern, matchnum).identaction = (identjump?1:0) | (identautocomp?2:0);
994 }
995 #endif
996 DBG_PATCOMPILE
997 ("add_keyword_to_scanning_table,pattern=%s,starts_block=%d,ends_block=%d,blockstartpattern=%d, context=%d,nextcontext=%d and got matchnum %d\n",
998 pattern, starts_block, ends_block, blockstartpattern, context, nextcontext, matchnum);
999
1000 if (is_regex) {
1001 compile_limitedregex_to_DFA(st, pattern, case_insens, matchnum, context);
1002 } else {
1003 compile_keyword_to_DFA(st, pattern, matchnum, context, case_insens);
1004 }
1005 / *print_DFA(st, 'a', 'z'); * /
1006 return matchnum;
1007 }*/
1008
1009 void
print_DFA_subset(Tscantable * st,gint16 context,char * chars)1010 print_DFA_subset(Tscantable * st, gint16 context, char *chars)
1011 {
1012 gint i, j, len;
1013 len = strlen(chars);
1014 g_print("***************** print subset of DFA table for context %d\n",(gint)context);
1015 g_print(" ");
1016 for (j = 0; j < len; j++) {
1017 switch(chars[j]) {
1018 case '\n':
1019 g_print(" \\n ");
1020 break;
1021 case '\r':
1022 g_print(" \\r ");
1023 break;
1024 case '\t':
1025 g_print(" \\t ");
1026 break;
1027 default:
1028 g_print(" '%c' ", chars[j]);
1029 }
1030 }
1031 g_print(": match\n");
1032 for (i = 0; i < g_array_index(st->contexts, Tcontext, context).table->len; i++) {
1033 g_print("%4d: ", i);
1034 for (j = 0; j < len; j++) {
1035 g_print("%4d ", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).row[(gshort) chars[j]]);
1036 }
1037 g_print(" : %4d", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match);
1038 if (i==0) {
1039 g_print(" this is the startstate");
1040 } else if (i==1) {
1041 g_print(" this is the identstate");
1042 }
1043 if (g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match > 0) {
1044 g_print(" %s",
1045 g_array_index(st->matches, Tpattern,
1046 g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).pattern);
1047 if (g_array_index(st->matches, Tpattern, g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext
1048 > 0) {
1049 g_print(" --> goto context %d",
1050 g_array_index(st->matches, Tpattern,
1051 g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext);
1052 } else
1053 if (g_array_index
1054 (st->matches, Tpattern, g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext < 0) {
1055 g_print(" --> pop context: %d",
1056 g_array_index(st->matches, Tpattern,
1057 g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext);
1058 }
1059 }
1060 g_print("\n");
1061 }
1062 g_print("*****************\n");
1063 }
1064 #ifdef ENABLE_PRINT_DFA
1065 void
print_DFA(Tscantable * st,gint16 context,char start,char end)1066 print_DFA(Tscantable * st, gint16 context, char start, char end)
1067 {
1068 gint i, j;
1069 g_print(" ");
1070 for (j = start; j <= end; j++) {
1071 g_print(" %c ", j);
1072 }
1073 g_print(": match\n");
1074 for (i = 0; i < g_array_index(st->contexts, Tcontext, context).table->len; i++) {
1075 g_print("%3d: ", i);
1076 for (j = start; j <= end; j++) {
1077 g_print("%3d ", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).row[j]);
1078 }
1079 g_print(": %3d (%s)\n", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match,
1080 g_array_index(st->matches, Tpattern,g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).pattern);
1081 }
1082
1083 }
1084 #endif /* ENABLE_PRINT_DFA */
1085
1086 Tscantable *
scantable_new(guint size_table,guint size_matches,guint size_contexts)1087 scantable_new(guint size_table, guint size_matches, guint size_contexts)
1088 {
1089 Tscantable *st;
1090 st = g_slice_new0(Tscantable);
1091 DBG_PARSING("scantable_new, initial size table %d, contexts %d, matches %d\n", size_table, size_matches,
1092 size_contexts);
1093 st->contexts = g_array_sized_new(TRUE, TRUE, sizeof(Tcontext), size_contexts);
1094 st->matches = g_array_sized_new(TRUE, TRUE, sizeof(Tpattern), size_matches);
1095 st->comments = g_array_sized_new(TRUE, FALSE, sizeof(Tcomment), 8);
1096 st->blocks = g_array_sized_new(TRUE, FALSE, sizeof(Tpattern_block), 8);
1097 st->conditions = g_array_sized_new(TRUE, FALSE, sizeof(Tpattern_block), 8);
1098 st->matches->len = 1; /* match 0 means no match */
1099 st->contexts->len = 1; /* a match with nextcontext 0 means no context change, so we cannot use context 0 */
1100 st->blocks->len = 1; /* block 0 means no block */
1101 st->conditions->len = 1;
1102 return st;
1103 }
1104
1105