1 /* Bluefish HTML Editor
2  * bftextview2_patcompile.c
3  *
4  * Copyright (C) 2008,2009,2011,2013,2014,2015 Olivier Sessink
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /*#define ENABLE_PRINT_DFA*/
21 
22 /* for the design docs see bftextview2.h */
23 #include <string.h>
24 #include "bluefish.h"
25 #include "bftextview2_private.h"
26 #include "bftextview2_patcompile.h"
27 #include "bftextview2_langmgr.h"
28 #include "bf_lib.h"
29 /*
30 we do regex pattern support as well.
31 
32 we don't do everything that pcre or posix regex patterns can
33 do - these engines have features that cannot be done in a DFA (such as backtracking)
34 
35 There are several ways in which regex patterns can be simplified:
36 
37 a(bc)+ == abc(abc)*
38 [a-z]+ == [a-z][a-z]*
39 OR reverse:
40 a(bc)* == (a|a[bc]+)
41 a[a-z]* == (a|a[a-z]+)
42 
43 a(bc)? = (a|abc)
44 a[a-z]? = (a|a[a-z])
45 
46 a(b|c)d == (abd|acd)
47 
48 [^a] == [b-z] (and all other ascii characters, for simplification I just use the alphabet)
49 
50 so at least we need to be able to compile:
51 the OR construct: (|)
52 the zero-or-more *
53 the character list [a-z]
54 
55 */
56 
57 #ifdef ENABLE_PRINT_DFA
58 void print_DFA_subset(Tscantable * st, gint16 context, char *chars);
59 #endif
60 
61 static gint
pointersort_compare(gconstpointer a,gconstpointer b)62 pointersort_compare(gconstpointer a, gconstpointer b)
63 {
64 	return a > b ? -1 : (a == b ? 0 : 1);
65 }
66 
67 /* returns a list of tags that are used in this language */
68 GList *
bftextview2_scantable_rematch_highlights(Tscantable * st,const gchar * lang)69 bftextview2_scantable_rematch_highlights(Tscantable * st, const gchar * lang)
70 {
71 	int i = 0;
72 	GList *retlist = NULL, *tmplist;
73 	gpointer temp = NULL;
74 	for (i = 0; i < (st->contexts->len); i++) {
75 /*		g_print("context %d",i);
76 		g_print(" has highlight %s\n",g_array_index(st->contexts, Tcontext, i).contexthighlight);*/
77 		if (g_array_index(st->contexts, Tcontext, i).contexthighlight) {
78 			g_array_index(st->contexts, Tcontext, i).contexttag =
79 				langmrg_lookup_tag_highlight(lang, g_array_index(st->contexts, Tcontext, i).contexthighlight);
80 			if (g_array_index(st->contexts, Tcontext, i).contexttag) {
81 				retlist = g_list_prepend(retlist, g_array_index(st->contexts, Tcontext, i).contexttag);
82 			} else {
83 				g_print("Possible error in language file, no textstyle found for context highlight %s\n",
84 						g_array_index(st->contexts, Tcontext, i).contexthighlight);
85 			}
86 		}
87 	}
88 	for (i = 0; i < (st->matches->len); i++) {
89 		/*g_print("pattern %d",i);
90 		   g_print(" (%s)",g_array_index(st->matches, Tpattern, i).pattern);
91 		   g_print(" has selfhighlight %s and blockhighlight %s\n",g_array_index(st->matches, Tpattern, i).selfhighlight,g_array_index(st->matches, Tpattern, i).blockhighlight); */
92 		if (g_array_index(st->matches, Tpattern, i).selfhighlight) {
93 			g_array_index(st->matches, Tpattern, i).selftag =
94 				langmrg_lookup_tag_highlight(lang, g_array_index(st->matches, Tpattern, i).selfhighlight);
95 			if (g_array_index(st->matches, Tpattern, i).selftag)
96 				retlist = g_list_prepend(retlist, g_array_index(st->matches, Tpattern, i).selftag);
97 			else
98 				g_print("Possible error in language file, no textstyle found for highlight %s\n",
99 						g_array_index(st->matches, Tpattern, i).selfhighlight);
100 		}
101 		/*if (g_array_index(st->matches, Tpattern, i).blockhighlight) {
102 			g_array_index(st->matches, Tpattern, i).blocktag =
103 				langmrg_lookup_tag_highlight(lang, g_array_index(st->matches, Tpattern, i).blockhighlight);
104 			if (g_array_index(st->matches, Tpattern, i).blocktag)
105 				retlist = g_list_prepend(retlist, g_array_index(st->matches, Tpattern, i).blocktag);
106 		}*/
107 	}
108 	for (i = 0; i < (st->blocks->len); i++) {
109 		if (g_array_index(st->blocks, Tpattern_block, i).highlight) {
110 			g_array_index(st->blocks, Tpattern_block, i).tag = langmrg_lookup_tag_highlight(lang, g_array_index(st->blocks, Tpattern_block, i).highlight);
111 			if (g_array_index(st->blocks, Tpattern_block, i).tag) {
112 				retlist = g_list_prepend(retlist, g_array_index(st->blocks, Tpattern_block, i).tag);
113 			} else {
114 				g_print("Possible error in language file, no textstyle found for block highlight %s\n",
115 						g_array_index(st->blocks, Tpattern_block, i).highlight);
116 			}
117 		}
118 	}
119 
120 	/* now remove all duplicate tags */
121 	tmplist = retlist = g_list_sort(retlist, (GCompareFunc) pointersort_compare);
122 	while (tmplist) {
123 		if (tmplist->data == temp) {
124 			GList *tofree = tmplist;
125 			tmplist = tmplist->next;
126 			/* duplicate ! */
127 			tofree->prev->next = tmplist;
128 			if (tmplist)
129 				tmplist->prev = tofree->prev;
130 			g_list_free_1(tofree);
131 		} else {
132 			/* not duplicate */
133 			temp = tmplist->data;
134 			tmplist = tmplist->next;
135 		}
136 	}
137 
138 	return retlist;
139 }
140 
141 /*static void print_characters(gchar *characters) {
142 	int i;
143 	DBG_PATCOMPILE("we have active characters ");
144 	for (i=0;i<NUMSCANCHARS;i++) {
145 		if (characters[i]==1)
146 			DBG_PATCOMPILE("%c ",i);
147 	}
148 	DBG_PATCOMPILE("\n");
149 }*/
150 
151 static gint
fill_characters_from_range(gchar * input,gchar * characters)152 fill_characters_from_range(gchar * input, gchar * characters)
153 {
154 	gboolean reverse = 0;
155 	gint i = 0;
156 	if (input[i] == '^') {		/* see if it is a NOT pattern */
157 		reverse = 1;
158 		memset(characters, 1, NUMSCANCHARS * sizeof(char));
159 		i++;
160 	}
161 	if (input[i] == ']') {		/* ] is actually part of the collection */
162 		characters['['] = 1;
163 		i++;
164 	}
165 	if (input[i] == '-') {		/* - is actually part of the collection */
166 		characters['-'] = 1;
167 		i++;
168 	}
169 	while (input[i] != ']') {
170 		if (input[i] == '-') {	/* range all characters between the previous and the next char */
171 			gchar j;
172 			if (input[i + 1] == ']') {	/* - is actually part of the collection */
173 				characters['-'] = 1;
174 				i++;
175 			} else {
176 				DBG_PATCOMPILE("set characters from %c to %c to %d\n", input[i - 1], input[i + 1],
177 							   1 - reverse);
178 				for (j = input[i - 1]; j <= input[i + 1]; j++) {
179 					characters[(int) j] = 1 - reverse;
180 				}
181 				i += 2;
182 			}
183 		} else {
184 			DBG_PATCOMPILE("set characters %c (pos %d) to %d\n", input[i], input[i], 1 - reverse);
185 			characters[(int) input[i]] = 1 - reverse;
186 			i++;
187 		}
188 	}
189 	return i;
190 }
191 
192 static void
push_on_stack(GQueue * stack,gint state)193 push_on_stack(GQueue * stack, gint state)
194 {
195 	GList *tmplist;
196 	tmplist = stack->head;
197 	while (tmplist) {
198 		if (GPOINTER_TO_INT(tmplist->data) == state)
199 			return;
200 		tmplist = tmplist->next;
201 	}
202 	g_queue_push_head(stack, GINT_TO_POINTER(state));
203 }
204 
205 static void
create_state_tables(Tscantable * st,gint16 context,gchar * characters,gboolean pointtoself,GQueue * positions,GQueue * newpositions,gboolean end_is_symbol)206 create_state_tables(Tscantable * st, gint16 context, gchar * characters, gboolean pointtoself,
207 					GQueue * positions, GQueue * newpositions, gboolean end_is_symbol)
208 {
209 	guint c, pos;
210 	const guint identstate =1;
211 	gint newstate = -1;			/* if all characters can follow existing states we don't need any newstate
212 								   and thus newstate==-1 but if one or more characters in one or more states need a new state
213 								   it will be >0 */
214 	DBG_PATCOMPILE("create_state_tables, context=%d, started with get_length(positions)=%d states, pointtoself=%d, end_is_symbol=%d\n",
215 				   (gint)context,g_queue_get_length(positions), pointtoself,end_is_symbol);
216 	while (g_queue_get_length(positions)) {
217 		pos = GPOINTER_TO_INT(g_queue_pop_head(positions));
218 		/*DBG_PATCOMPILE("working on position %d, identstate=%d\n", pos, identstate);*/
219 		for (c = 0; c < NUMSCANCHARS; c++) {
220 			if (characters[c] == 1) {
221 				DBG_PATCOMPILE("running for position %d char %c (=%d)\n",pos,c,c);
222 				/* there is a bug in the pattern compiler that may compile a buggy language file without a warning:
223 				if a new keyword is added, and all characters already exist there are no new states created: this function
224 				will simply follow the existing states. BUT IT DOES NOT CHECK IF THERE ARE MULTIPLE WAYS TO GET TO THESE
225 				EXISTING STATES. So if those states are created by a regex pattern that allows jumping to these states via
226 				various other characters, you will have an overlapping pattern WITHOUT WARNING.
227 
228 				You can avoid this bug by having regex patterns alwats in the end of a context.
229 				 */
230 
231 
232 				if (get_tablerow(st, context, pos).row[c] != 0
233 					&& get_tablerow(st, context, pos).row[c] != identstate) {
234 					if (get_tablerow(st, context, pos).row[c] == pos) {
235 						/* this state is a 'morestate', a state that points to itself, for a (regex) pattern such as a+ or a* */
236 						if (pointtoself) {
237 							push_on_stack(newpositions,
238 										  (gint) get_tablerow(st, context, pos).row[c]);
239 						} else {
240 							if (newstate == -1) {
241 								newstate = get_table(st, context)->len;
242 								DBG_PATCOMPILE("create_state_tables, create newstate %d from morestate %d\n",
243 											   newstate, pos);
244 								g_array_set_size(get_table(st,context), get_table(st,context)->len + 1);
245 								if (get_table(st,context)->len+1 >= G_MAXUINT16) {
246 									g_print("Critical error in language file: state table overflow!!!!!!!!\n");
247 								}
248 								/* pass-on the morestate */
249 								memcpy(get_tablerow(st, context, newstate).row,
250 									   get_tablerow(st, context, pos).row,
251 									   sizeof(guint16[NUMSCANCHARS]));
252 								get_tablerow(st, context, pos).row[c] = newstate;
253 								g_queue_push_head(newpositions, GINT_TO_POINTER(newstate));
254 							} else {
255 								get_tablerow(st, context, pos).row[c] = newstate;
256 							}
257 							/*g_print("*****************************\nBUG: we cannot parse this pattern after a regex pattern yet !!!!!!!!!!!!!\nstate tables will be incorrect!!!\n*****************************\n"); */
258 						}
259 					} else {
260 						if (pointtoself) {	/* perhaps check if we have this state on the stack already */
261 							push_on_stack(positions, (gint) get_tablerow(st, context, pos).row[c]);
262 						} else {
263 							if (get_tablerow(st, context, pos).row[c] > pos) {
264 								DBG_PATCOMPILE("points to state %d, pushing %d on newpositions\n", (gint) get_tablerow(st, context, pos).row[c], (gint) get_tablerow(st, context, pos).row[c]);
265 								push_on_stack(newpositions,
266 											  (gint) get_tablerow(st, context, pos).row[c]);
267 							} else {	/* this is probably a morestate that has been passed-on several states */
268 								if (newstate == -1) {
269 									newstate = get_table(st, context)->len;
270 									DBG_PATCOMPILE
271 										("create_state_tables, create newstate %d from morestate %d\n",
272 										 newstate, pos);
273 									g_array_set_size(get_table(st,context), get_table(st,context)->len + 1);
274 									if (get_table(st,context)->len+1 >= G_MAXUINT16) {
275 										g_print("Critical error in language file: state table overflow!!!!!!!!\n");
276 									}
277 									/* pass-on the morestate */
278 									memcpy(get_tablerow(st, context, newstate).row,
279 										   get_tablerow(st, context, pos).row,
280 										   sizeof(guint16[NUMSCANCHARS]));
281 									get_tablerow(st, context, pos).row[c] = newstate;
282 									g_queue_push_head(newpositions, GINT_TO_POINTER(newstate));
283 								} else {
284 									get_tablerow(st, context, pos).row[c] = newstate;
285 								}
286 							}
287 
288 						}
289 					}
290 				} else {
291 					if (newstate == -1) {
292 						get_tablerow(st, context, pos).row[c] = newstate = get_table(st,context)->len;
293 						DBG_PATCOMPILE("create_state_tables, create newstate %d, pointtoself=%d\n", newstate,
294 									   pointtoself);
295 						g_array_set_size(get_table(st,context), get_table(st,context)->len + 1);
296 						if (get_table(st,context)->len+1 >= G_MAXUINT16) {
297 							g_print("Critical error in language file: state table overflow!!!!!!!!\n");
298 						}
299 						if (!character_is_symbol(st, context, c)) {
300 							/* normally this memcpy copies the identstate to the current state such that all symbols still point to the
301 							   startstate but all non-symbols point to the identstate */
302 							DBG_PATCOMPILE
303 								("create_state_tables, newstate %d does not end on a symbol, init all values equal to the identstate %d\n",
304 								 newstate, 1);
305 							memcpy(get_tablerow(st,context,newstate).row,
306 								   get_tablerow(st,context,identstate).row,
307 								   sizeof(guint16[NUMSCANCHARS]));
308 						} else {
309 							DBG_PATCOMPILE
310 								("create_state_tables, newstate %d ends on a symbol, so init all values with 0\n",
311 								 newstate);
312 						}
313 						g_queue_push_head(newpositions, GINT_TO_POINTER(newstate));
314 						if (pointtoself) {
315 							guint d;
316 							for (d = 0; d < NUMSCANCHARS; d++) {
317 								if (characters[d] == 1) {
318 									/*DBG_PATCOMPILE("in newstate %d, character %c points to %d\n",newstate,d,newstate); */
319 									get_tablerow(st,context,newstate).row[d] = newstate;
320 								}
321 							}
322 						}
323 					} else {
324 						get_tablerow(st,context,pos).row[c] = newstate;
325 						/* nothing to put on the stack, newstate should be on newpositions already if it is not -1 */
326 					}
327 				}
328 			}
329 		}
330 	}
331 	DBG_PATCOMPILE("create_state_tables, done, get_lenght(newpositions)=%d\n", g_queue_get_length(newpositions));
332 }
333 
334 static void
merge_queues(GQueue * target,GQueue * src)335 merge_queues(GQueue * target, GQueue * src)
336 {
337 	while (g_queue_get_length(src)) {
338 		DBG_PATCOMPILE("merge queue, push state %d to queue\n", GPOINTER_TO_INT(g_queue_peek_head(src)));
339 		g_queue_push_head(target, g_queue_pop_head(src));
340 	}
341 	DBG_PATCOMPILE("merge queue, target queue has length %d \n", g_queue_get_length(target));
342 }
343 
344 static GQueue *process_regex_part(Tscantable * st, gchar * regexpart, gint16 context,
345 								  gboolean caseinsensitive, GQueue * inputpositions,
346 								  gboolean regexpart_ends_regex);
347 
348 static GQueue *
run_subpatterns(Tscantable * st,gchar * regexpart,gint16 context,gboolean caseinsensitive,GQueue * inputpositions,gint * regexpartpos,gboolean regexpart_ends_regex)349 run_subpatterns(Tscantable * st, gchar * regexpart, gint16 context, gboolean caseinsensitive,
350 				GQueue * inputpositions, gint * regexpartpos, gboolean regexpart_ends_regex)
351 {
352 	gint j = 0;
353 	gchar *target;
354 	GQueue *mergednewpositions = g_queue_new();
355 	target = g_strdup(&regexpart[*regexpartpos]);	/* a very easy way to make target a buffer long enough to hold any subpattern */
356 
357 	while (regexpart[*regexpartpos] != '\0') {
358 		DBG_PATCOMPILE("run_subpatterns, regexpart[%d]=%c\n", *regexpartpos, regexpart[*regexpartpos]);
359 		if (regexpart[*regexpartpos] == '\\') {
360 			if (regexpart[*regexpartpos + 1] == '|' || regexpart[*regexpartpos + 1] == ')') {
361 				target[j] = regexpart[*regexpartpos + 1];
362 				*regexpartpos = *regexpartpos + 1;
363 				j++;
364 			} else {
365 				target[j] = regexpart[*regexpartpos];
366 				j++;
367 			}
368 		} else if ((regexpart[*regexpartpos] == '|' || regexpart[*regexpartpos] == ')')) {
369 			/* found a subpattern */
370 			GQueue *newpositions;
371 			target[j] = '\0';
372 			DBG_PATCOMPILE("at regexpartpos=%d found SUBPATTERN %s\n", *regexpartpos, target);
373 			newpositions =
374 				process_regex_part(st, target, context, caseinsensitive, inputpositions,
375 								   regexpart_ends_regex);
376 			merge_queues(mergednewpositions, newpositions);
377 			g_queue_free(newpositions);
378 			j = 0;
379 
380 			if (regexpart[*regexpartpos] == ')') {
381 				g_free(target);
382 				return mergednewpositions;
383 			}
384 		} else {
385 			target[j] = regexpart[*regexpartpos];
386 			j++;
387 		}
388 		*regexpartpos = *regexpartpos + 1;
389 	}
390 	g_free(target);
391 	return mergednewpositions;
392 }
393 
394 /* this function can be called recursively for subpatterns. It gets all current valid states in
395 inputpositions, and returns all valid outputstates.
396 
397 regexpart_ends_regex is a boolean that is used to set the state for the last character of the regex.
398 if the last character of a pattern is a symbol, the states are different. because this function is
399 called recursively this is only relevant for the first call.
400 BUG: this is not true, for a pattern like <(a|b) the a and b are both end-patterns, but they are handled in a
401 subpattern
402 
403 
404 */
405 static GQueue *
process_regex_part(Tscantable * st,gchar * regexpart,gint16 context,gboolean caseinsensitive,GQueue * inputpositions,gboolean regexpart_ends_regex)406 process_regex_part(Tscantable * st, gchar * regexpart, gint16 context, gboolean caseinsensitive,
407 				   GQueue * inputpositions, gboolean regexpart_ends_regex)
408 {
409 	gboolean escaped = FALSE;
410 	gint i = 0;
411 	char characters[NUMSCANCHARS];
412 	GQueue *newpositions, *positions, *tmp;
413 
414 	positions = g_queue_copy(inputpositions);
415 	newpositions = g_queue_new();
416 	DBG_PATCOMPILE("process_regex_part, running part %s\n", regexpart);
417 	while (1) {
418 		memset(&characters, 0, NUMSCANCHARS * sizeof(char));
419 		escaped = 0;
420 		DBG_PATCOMPILE("start of loop, regexpart[%d]=%c, have %d positions\n", i, regexpart[i],
421 					   g_queue_get_length(positions));
422 		if (regexpart[i] == '\0') {	/* end of pattern */
423 			DBG_PATCOMPILE("end of pattern, positions(%p) has %d entries\n", positions,
424 						   g_queue_get_length(positions));
425 			g_queue_free(newpositions);
426 			return positions;
427 		} else {
428 			if (!escaped && regexpart[i] == '\\') {
429 				DBG_PATCOMPILE("found \\ at position %d:next pattern is escaped\n", i);
430 				escaped = TRUE;
431 				i++;
432 			}
433 			if (!escaped && regexpart[i] == '(') {
434 				gchar *tmp;
435 				gboolean subpattern_ends_regex = FALSE;
436 				/* a subpattern */
437 				DBG_PATCOMPILE("found subpatern start at %d\n", i);
438 				i++;
439 				tmp = strchr(&regexpart[i], ')');
440 				/* BUG: this code doesn't handle an escaped \) sequence */
441 				if (tmp) {
442 					tmp++;
443 					if (*tmp == '\0')
444 						subpattern_ends_regex = TRUE;
445 				}
446 				newpositions =
447 					run_subpatterns(st, regexpart, context, caseinsensitive, positions, &i,
448 									subpattern_ends_regex);
449 				DBG_PATCOMPILE("end of subpatern at %d (%c)\n", i, regexpart[i]);
450 				/* BUG: if the subpattern is followed by an operator foo(bla)? this is not handled by the code !!!!!!!! */
451 			} else {
452 				gboolean end_is_symbol = FALSE;
453 				if (!escaped) {
454 					switch (regexpart[i]) {
455 					case '.':
456 						memset(&characters, 1, NUMSCANCHARS * sizeof(char));
457 						break;
458 					case '[':
459 						DBG_PATCOMPILE("found range at i=%d, fill characters\n", i);
460 						i += fill_characters_from_range(&regexpart[i + 1], characters) + 1;
461 						break;
462 					default:
463 						characters[(int) regexpart[i]] = 1;
464 						break;
465 					}
466 				} else {		/* escaped */
467 					DBG_PATCOMPILE("character %c was escaped, adding to characters\n", regexpart[i]);
468 					characters[(int) regexpart[i]] = 1;
469 				}
470 				/* handle case insensitiveness */
471 				if (caseinsensitive) {
472 					gint j;
473 					for (j = 'a'; j <= 'z'; j++) {
474 						if (characters[j] == 1)
475 							characters[j - 32] = 1;
476 						if (characters[j - 32] == 1)
477 							characters[j] = 1;
478 					}
479 				}
480 				/* TODO end_is_symbol no longer used in create_state_tables() , so it can be removed*/
481 				DBG_PATCOMPILE("i=%d, testing i+1  (%c) for operator\n", i, regexpart[i + 1]);
482 				/*print_characters(characters); */
483 				/* see if there is an operator */
484 				if (regexpart[i] == '\0') {
485 					DBG_PATCOMPILE("no operator, end of (sub)pattern\n");
486 					create_state_tables(st, context, characters, FALSE, positions, newpositions,
487 										end_is_symbol);
488 				} else if (regexpart[i + 1] == '+') {
489 					DBG_PATCOMPILE("handling + operator\n");
490 					create_state_tables(st, context, characters, TRUE, positions, newpositions,
491 										end_is_symbol);
492 					i++;
493 				} else if (regexpart[i + 1] == '*') {
494 					GQueue *tmp = g_queue_copy(positions);
495 					DBG_PATCOMPILE("handling * operator\n");
496 					create_state_tables(st, context, characters, TRUE, positions, newpositions,
497 										end_is_symbol);
498 					merge_queues(newpositions, tmp);
499 					g_queue_free(tmp);
500 					i++;
501 				} else if (regexpart[i + 1] == '?') {
502 					GQueue *tmp = g_queue_copy(positions);
503 					DBG_PATCOMPILE("handling ? operator\n");
504 					create_state_tables(st, context, characters, FALSE, positions, newpositions,
505 										end_is_symbol);
506 					merge_queues(newpositions, tmp);
507 					g_queue_free(tmp);
508 					i++;
509 				} else {
510 					DBG_PATCOMPILE("no operator\n");
511 					create_state_tables(st, context, characters, FALSE, positions, newpositions,
512 										end_is_symbol);
513 				}
514 			}
515 			g_queue_clear(positions);
516 			tmp = positions;
517 			positions = newpositions;
518 			newpositions = tmp;
519 		}
520 		i++;
521 	}
522 	/* cannot get here */
523 }
524 
525 static void
compile_limitedregex_to_DFA(Tscantable * st,gchar * input,gboolean caseinsensitive,guint16 matchnum,gint16 context,gpointer ldb)526 compile_limitedregex_to_DFA(Tscantable * st, gchar * input, gboolean caseinsensitive, guint16 matchnum,
527 							gint16 context, gpointer ldb)
528 {
529 	GQueue *positions, *newpositions;
530 	gchar *lregex;
531 	gint p;
532 	positions = g_queue_new();
533 
534 	if (caseinsensitive) {
535 		/* make complete string lowercase */
536 		lregex = g_ascii_strdown(input, -1);
537 	} else {
538 		lregex = g_strdup(input);
539 	}
540 
541 	g_queue_push_head(positions,
542 					  GINT_TO_POINTER((gint) 0));
543 	DBG_PATCOMPILE("lregex=%s, positionstack has length %d\n", lregex, g_queue_get_length(positions));
544 	newpositions = process_regex_part(st, lregex, context, caseinsensitive, positions, TRUE);
545 	/*compile_limitedregex_to_DFA_backend(st,lregex,context,caseinsensitive,&positions); */
546 	DBG_PATCOMPILE("after compiling positionstack has length %d\n", g_queue_get_length(positions));
547 	while ((g_queue_get_length(newpositions))) {
548 		p = GPOINTER_TO_INT(g_queue_pop_head(newpositions));
549 		DBG_PATCOMPILE("mark state %d as possible end-state\n", p);
550 		if (get_tablerow(st, context, p).match != 0
551 			&& get_tablerow(st, context, p).match != matchnum) {
552 			gchar *dbstring = ldb_stack_string(ldb);
553 			g_print("Error in language file %s: patterns %s and %s in context %d overlap each other\n", dbstring, input,
554 					g_array_index(st->matches, Tpattern,
555 								  get_tablerow(st, context, p).match).pattern, context);
556 			g_free(dbstring);
557 		} else {
558 			get_tablerow(st, context, p).match = matchnum;
559 		}
560 	}
561 	g_queue_free(positions);
562 	g_queue_free(newpositions);
563 	g_free(lregex);
564 
565 #ifdef ENABLE_PRINT_DFA
566 	if (context == 1) {
567 		print_DFA_subset(st, context, "a=1; \n\t");
568 	}
569 #endif
570 }
571 
572 /* this function cannot do any regex style patterns
573 just full keywords */
574 static void
compile_keyword_to_DFA(Tscantable * st,const gchar * keyword,guint16 matchnum,gint16 context,gboolean case_insens,gpointer ldb)575 compile_keyword_to_DFA(Tscantable * st, const gchar * keyword, guint16 matchnum, gint16 context,
576 					   gboolean case_insens, gpointer ldb)
577 {
578 	gint i, len;
579 	GQueue *positions, *newpositions;
580 	gchar *pattern;
581 	gchar characters[NUMSCANCHARS];
582 	gboolean end_is_symbol;
583 
584 	positions = g_queue_new();
585 	newpositions = g_queue_new();
586 
587 	/* compile the keyword into the DFA */
588 	g_queue_push_head(positions,
589 					  GINT_TO_POINTER((gint) 0));
590 
591 	if (!keyword) {
592 		g_warning("compile_keyword_to_DFA, called without keyword!\n");
593 		return;
594 	}
595 
596 	if (case_insens) {
597 		/* make complete string lowercase */
598 		pattern = g_ascii_strdown(keyword, -1);
599 	} else {
600 		pattern = g_strdup(keyword);
601 	}
602 
603 	DBG_PATCOMPILE("in context %d we start with position %d\n", (gint)context, GPOINTER_TO_INT(g_queue_peek_head(positions)));
604 	len = strlen(pattern);
605 
606 	end_is_symbol =
607 		character_is_symbol(st, context, (gint) pattern[len - 1]);
608 
609 	for (i = 0; i <= len; i++) {
610 		/*GQueue *tmp; */
611 		int c = pattern[i];
612 		memset(&characters, 0, NUMSCANCHARS * sizeof(char));
613 		if (c == '\0') {
614 			while ((g_queue_get_length(positions))) {
615 				gint p;
616 				p = GPOINTER_TO_INT(g_queue_pop_head(positions));
617 				DBG_PATCOMPILE("mark state %d as possible end-state\n", p);
618 				if (get_tablerow(st, context, p).match != 0
619 					&& get_tablerow(st, context, p).match != matchnum) {
620 					gchar *dbstring = ldb_stack_string(ldb);
621 					g_print("Error in language file %s: patterns %s and %s in context %d overlap each other\n",
622 							dbstring, keyword, g_array_index(st->matches, Tpattern,
623 												   get_tablerow(st, context, p).match).pattern,
624 							context);
625 					g_free(dbstring);
626 				}
627 				get_tablerow(st, context, p).match = matchnum;
628 			}
629 		} else {
630 			characters[c] = 1;
631 			if (case_insens && c >= 97 && c <= 122)
632 				characters[c - 32] = 1;
633 			create_state_tables(st, context, characters, FALSE, positions, newpositions, end_is_symbol);
634 		}
635 		g_queue_clear(positions);
636 		pointer_switch_addresses((void *) &positions, (void *) &newpositions);
637 		/*tmp = positions;
638 		   positions = newpositions;
639 		   newpositions = tmp; */
640 	}
641 	g_queue_clear(positions);
642 	g_queue_clear(newpositions);
643 	g_queue_free(positions);
644 	g_queue_free(newpositions);
645 	g_free(pattern);
646 }
647 
648 /*
649  * symbols can be free'ed
650  * contexthighlight is directly stored in Tcontext
651  *
652  *
653  */
654 gint16
new_context(Tscantable * st,guint expected_size,const gchar * symbols,const gchar * contexthighlight,gboolean autocomplete_case_insens,gboolean default_spellcheck,gboolean dump_dfa_run)655 new_context(Tscantable * st, guint expected_size, const gchar * symbols, const gchar * contexthighlight,
656 			gboolean autocomplete_case_insens, gboolean default_spellcheck, gboolean dump_dfa_run)
657 {
658 	gint16 context;
659 	gint i;
660 	const gchar *tmp;
661 	GArray *tmptable;
662 
663 	context = st->contexts->len;
664 	g_array_set_size(st->contexts, st->contexts->len + 1);
665 
666 	g_array_index(st->contexts, Tcontext, context).autocomplete_case_insens = autocomplete_case_insens;
667 	g_array_index(st->contexts, Tcontext, context).default_spellcheck = default_spellcheck;
668 	g_array_index(st->contexts, Tcontext, context).dump_dfa_run = dump_dfa_run;
669 	g_array_index(st->contexts, Tcontext, context).contexthighlight = (gchar *) contexthighlight;
670 	tmptable = g_array_index(st->contexts, Tcontext, context).table = g_array_sized_new(TRUE, TRUE, sizeof(Ttablerow), expected_size);
671 	g_array_set_size(g_array_index(st->contexts, Tcontext, context).table, 2); /* first two states are the startstate (0) and the identstate (1) */
672 	/* identstate refers to itself for all characters except the symbols. we cannot use memset
673 	   because an guint16 occupies 2 bytes */
674 	for (i = 0; i < NUMSCANCHARS; i++)
675 		g_array_index(tmptable, Ttablerow, 1).row[i] = 1;
676 
677 	/* 0, \0 or NULL is always a symbol */
678 	g_array_index(tmptable, Ttablerow, 1).row[0] = 0;
679 	tmp = symbols;
680 	while (*tmp) {
681 		/*g_print("mark %c as symbol\n",*tmp); */
682 		g_array_index(tmptable, Ttablerow, 1).row[(int) *tmp] = 0;
683 
684 		if (st->allsymbols[(int) *tmp] == 0)
685 			st->allsymbols[(int) *tmp] = 1;
686 
687 		tmp++;
688 	}
689 	/* now copy the identstate to the startstate, so every symbol in the startstate will point to the identstate */
690 	memcpy(g_array_index(tmptable, Ttablerow, 0).row,
691 		   g_array_index(tmptable, Ttablerow, 1).row, sizeof(guint16[NUMSCANCHARS]));
692 
693 	return context;
694 }
695 
696 void
match_set_nextcontext(Tscantable * st,guint16 matchnum,guint16 nextcontext)697 match_set_nextcontext(Tscantable * st, guint16 matchnum, guint16 nextcontext)
698 {
699 	DBG_PATCOMPILE("match_set_nextcontext, set match %d to have nextcontext %d\n", matchnum, nextcontext);
700 	g_array_index(st->matches, Tpattern, matchnum).nextcontext = nextcontext;
701 }
702 
703 void
match_autocomplete_reference(Tscantable * st,guint16 matchnum,guint16 context)704 match_autocomplete_reference(Tscantable * st, guint16 matchnum, guint16 context)
705 {
706 	gint pattern_id = matchnum;
707 	gboolean has_reference;
708 
709 	has_reference = (g_array_index(st->matches, Tpattern, matchnum).reference!=NULL);
710 
711 	if (!g_array_index(st->contexts, Tcontext, context).patternhash) {
712 		g_array_index(st->contexts, Tcontext, context).patternhash =
713 			g_hash_table_new(g_str_hash, g_str_equal);
714 	}
715 	if (has_reference) {
716 		g_hash_table_insert(g_array_index(st->contexts, Tcontext, context).patternhash,
717 						g_array_index(st->matches, Tpattern, matchnum).pattern, GINT_TO_POINTER(pattern_id));
718 	}
719 
720 	if (g_array_index(st->matches, Tpattern, matchnum).tagclose_from_blockstack
721 		&& !g_array_index(st->contexts, Tcontext, context).has_tagclose_from_blockstack) {
722 		DBG_AUTOCOMP("pattern %d in context %d has tagclose from blockstack!\n", matchnum, context);
723 		g_array_index(st->contexts, Tcontext, context).has_tagclose_from_blockstack = 1;
724 	}
725 
726 	if (g_array_index(st->matches, Tpattern, matchnum).autocomp_items) {
727 		GSList *tmpslist = g_array_index(st->matches, Tpattern, matchnum).autocomp_items;
728 		GList *list = NULL;
729 		if (!g_array_index(st->contexts, Tcontext, context).ac) {
730 			DBG_PATCOMPILE("create g_completion for context %d\n", context);
731 			g_array_index(st->contexts, Tcontext, context).ac = g_completion_new(NULL);
732 			if (g_array_index(st->contexts, Tcontext, context).autocomplete_case_insens)
733 				g_completion_set_compare(g_array_index(st->contexts, Tcontext, context).ac, strncasecmp);
734 			g_array_index(st->contexts, Tcontext, context).autocomplete_has_conditions = 0; /* defaults to 0 */
735 		}
736 		while (tmpslist) {
737 			gboolean already_handled=FALSE;
738 			Tpattern_autocomplete *pac = tmpslist->data;
739 			if (pac->condition != 0) {
740 				g_array_index(st->contexts, Tcontext, context).autocomplete_has_conditions = 1;
741 
742 			/* there is a special situation in Bluefish, if a attribute-context is re-used, a new conditional item
743 			is added to the '>' pattern, and match_autocomplete_reference() is called AGAIN for the same pattern.
744 			we can detect this by looking it up in the hash table  */
745 				if (GPOINTER_TO_INT(g_hash_table_lookup(g_array_index(st->contexts, Tcontext, context).patternhash,pac->autocomplete_string))==pattern_id) {
746 					DBG_AUTOCOMP("match_autocomplete_reference, ignore autocomplete string %s, is already handled\n",pac->autocomplete_string);
747 					already_handled = TRUE;
748 				}
749 			}
750 #ifdef DEVELOPMENT
751 			if (g_strcmp0(pac->autocomplete_string, g_array_index(st->matches, Tpattern, matchnum).pattern)!=0) {
752 				gint hashpat = GPOINTER_TO_INT(g_hash_table_lookup(g_array_index(st->contexts, Tcontext, context).patternhash,pac->autocomplete_string));
753 				if (hashpat && !already_handled) {
754 					g_print("autocompletion item %s is already in the context %d hash table for pattern %d, now handling pattern %d that has %d autocomplete entries?!\n", pac->autocomplete_string, context, hashpat, pattern_id, g_slist_length(g_array_index(st->matches, Tpattern, matchnum).autocomp_items));
755 				}
756 			}
757 #endif
758 			if (!already_handled) {
759 				list = g_list_prepend(list, pac->autocomplete_string);
760 				/* we only need to add an autocomplete string to the hash table if the pattern has a
761 				reference text, or backup_cursor is set, or trigger_new_autocomp_popup, or there
762 				is a condition AND if the autocompletion patern is different than the pattern itself (which is
763 				added to the hash table earlier in this function) */
764 				if (g_strcmp0(pac->autocomplete_string, g_array_index(st->matches, Tpattern, matchnum).pattern)!=0 && (has_reference || pac->condition != 0 || pac->autocomplete_backup_cursor!=0 || pac->trigger_new_autocomp_popup!=0)) {
765 					DBG_AUTOCOMP("match_autocomplete_reference, enter autocompletion item %s in hash table\n",pac->autocomplete_string);
766 					g_hash_table_insert(g_array_index(st->contexts, Tcontext, context).patternhash,
767 									pac->autocomplete_string, GINT_TO_POINTER(pattern_id));
768 				}
769 			}
770 			tmpslist = g_slist_next(tmpslist);
771 		}
772 		g_completion_add_items(g_array_index(st->contexts, Tcontext, context).ac, list);
773 		g_list_free(list);
774 	}
775 }
776 /*
777  * refname should not be freed, it is set in Tpattern_condition
778  *
779  */
780 static guint16
add_condition(Tscantable * st,const gchar * refname,gint relation,gint mode)781 add_condition(Tscantable * st, const gchar *refname, gint relation, gint mode)
782 {
783 	guint16 condnum = st->conditions->len;
784 	DBG_PARSING("add_condition, condnum=%d, called with refname %s\n",condnum,refname);
785 	g_array_set_size(st->conditions, st->conditions->len + 1);
786 	if (mode < 1 || mode > 4) {
787 		g_warning("Error in language file: condition mode %d is not defined\n",mode);
788 		return 0;
789 	}
790 	g_array_index(st->conditions, Tpattern_condition, condnum).refname = g_strdup(refname);
791 	g_array_index(st->conditions, Tpattern_condition, condnum).parentrelation = relation;
792 	g_array_index(st->conditions, Tpattern_condition, condnum).relationtype = mode;
793 	return condnum;
794 }
795 
796 /*
797  * autocomplete_string should be free'ed
798  * autocomplete_append should be free'ed
799  * refname should be free'ed
800  *
801  *
802  */
803 void
match_add_autocomp_item(Tscantable * st,guint16 matchnum,const gchar * autocomplete_string,const gchar * autocomplete_append,guint8 autocomplete_backup_cursor,guint8 trigger_new_autocomp_popup,const gchar * refname,gint relation,gint mode)804 match_add_autocomp_item(Tscantable * st, guint16 matchnum, const gchar * autocomplete_string,
805 						const gchar * autocomplete_append, guint8 autocomplete_backup_cursor,
806 						guint8 trigger_new_autocomp_popup,
807 						const gchar *refname, gint relation, gint mode)
808 {
809 	Tpattern_autocomplete *pac = g_slice_new(Tpattern_autocomplete);
810 	if (autocomplete_string) {
811 		pac->autocomplete_string = g_strdup(autocomplete_string);
812 	} else if (autocomplete_append) {
813 		pac->autocomplete_string =
814 			g_strconcat(g_array_index(st->matches, Tpattern, matchnum).pattern, autocomplete_append, NULL);
815 	} else {
816 		pac->autocomplete_string = g_array_index(st->matches, Tpattern, matchnum).pattern;
817 	}
818 
819 #ifdef DEVELOPMENT
820 	GSList *slist;
821 	slist=g_array_index(st->matches, Tpattern, matchnum).autocomp_items;
822 	while (slist) {
823 		Tpattern_autocomplete *tpac = slist->data;
824 		if (g_strcmp0(pac->autocomplete_string, tpac->autocomplete_string)==0) {
825 			g_print("Warning: string %s exists as autocomplete string for pattern %d\n",pac->autocomplete_string,matchnum);
826 		}
827 		slist = g_slist_next(slist);
828 	}
829 #endif
830 	pac->autocomplete_backup_cursor = autocomplete_backup_cursor;
831 	pac->trigger_new_autocomp_popup = trigger_new_autocomp_popup;
832 	pac->condition = (mode==0) ? 0 : add_condition(st, refname, relation, mode);
833 	g_array_index(st->matches, Tpattern, matchnum).autocomp_items =
834 		g_slist_prepend(g_array_index(st->matches, Tpattern, matchnum).autocomp_items, pac);
835 }
836 
837 void
match_set_reference(Tscantable * st,guint16 matchnum,const gchar * reference)838 match_set_reference(Tscantable * st, guint16 matchnum, const gchar * reference)
839 {
840 	if (reference)
841 		g_array_index(st->matches, Tpattern, matchnum).reference = g_strdup(reference);
842 }
843 
844 void
compile_existing_match(Tscantable * st,guint16 matchnum,gint16 context,gpointer ldb)845 compile_existing_match(Tscantable * st, guint16 matchnum, gint16 context, gpointer ldb)
846 {
847 	DBG_PATCOMPILE("compile existing match %d (%s) in context %d\n", matchnum,
848 				   g_array_index(st->matches, Tpattern, matchnum).pattern, context);
849 	if (g_array_index(st->matches, Tpattern, matchnum).is_regex) {
850 		compile_limitedregex_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern,
851 									g_array_index(st->matches, Tpattern, matchnum).case_insens, matchnum,
852 									context, ldb);
853 	} else {
854 		compile_keyword_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern, matchnum, context,
855 							   g_array_index(st->matches, Tpattern, matchnum).case_insens, ldb);
856 	}
857 	match_autocomplete_reference(st, matchnum, context);
858 }
859 
860 void
pattern_set_condition(Tscantable * st,guint16 matchnum,gchar * refname,gint relation,gint mode)861 pattern_set_condition(Tscantable * st, guint16 matchnum, gchar *refname, gint relation, gint mode)
862 {
863 	guint16 condnum = add_condition(st, refname, relation, mode);
864 	DBG_PARSING("pattern %d (%s) now has condition %d\n",matchnum,g_array_index(st->matches, Tpattern, matchnum).pattern,condnum);
865 	g_array_index(st->matches, Tpattern, matchnum).condition = condnum;
866 }
867 
868 void
pattern_set_blockmatch(Tscantable * st,guint16 matchnum,gboolean starts_block,gboolean ends_block,guint blockstartpattern,const gchar * blockhighlight,const gchar * blockname,gboolean foldable)869 pattern_set_blockmatch(Tscantable * st, guint16 matchnum,
870 							gboolean starts_block,
871 							gboolean ends_block,
872 							guint blockstartpattern,
873 							const gchar *blockhighlight,
874 							const gchar *blockname,
875 							gboolean foldable)
876 {
877 	if (starts_block == TRUE && ends_block == TRUE) {
878 		g_warning("Error in language file or Bluefish bug: pattern %s both starts and ends a block\n",
879 						g_array_index(st->matches, Tpattern, matchnum).pattern);
880 	}
881 	if (blockname && !starts_block) {
882 		g_warning("Error in language file or Bluefish bug: block_name %s can only be set on a block start\n", blockname);
883 	}
884 
885 	if (starts_block) {
886 		guint16 blocknum = 0;
887 		if (blockname || blockhighlight) { /* only create a block if we need it */
888 			blocknum = st->blocks->len;
889 			g_array_set_size(st->blocks, st->blocks->len + 1);
890 			g_array_index(st->blocks, Tpattern_block, blocknum).highlight = (gchar *) blockhighlight;
891 			g_array_index(st->blocks, Tpattern_block, blocknum).name = (gchar *) blockname;
892 			g_array_index(st->blocks, Tpattern_block, blocknum).foldable = foldable;
893 			g_array_index(st->blocks, Tpattern_block, blocknum).tag = NULL;
894 		}
895 		g_array_index(st->matches, Tpattern, matchnum).starts_block = 1;
896 		g_array_index(st->matches, Tpattern, matchnum).block = blocknum;
897 	} else if (ends_block) {
898 		g_array_index(st->matches, Tpattern, matchnum).ends_block = 1;
899 	}
900 	g_array_index(st->matches, Tpattern, matchnum).blockstartpattern = blockstartpattern;
901 }
902 
903 void
pattern_set_runtime_properties(Tscantable * st,guint16 matchnum,const gchar * selfhighlight,gint16 nextcontext,gboolean tagclose_from_blockstack,gboolean stretch_blockstart,guint8 identmode,gboolean identjump,gboolean identautocomp)904 pattern_set_runtime_properties(Tscantable * st, guint16 matchnum,
905 								const gchar * selfhighlight,
906 								gint16 nextcontext,
907 								gboolean tagclose_from_blockstack,
908 								gboolean stretch_blockstart,
909 								guint8 identmode,
910 								gboolean identjump,
911 								gboolean identautocomp)
912 {
913 	g_array_index(st->matches, Tpattern, matchnum).selfhighlight = (gchar *) selfhighlight;
914 	g_array_index(st->matches, Tpattern, matchnum).nextcontext = nextcontext;
915 	g_array_index(st->matches, Tpattern, matchnum).tagclose_from_blockstack = tagclose_from_blockstack;
916 	g_array_index(st->matches, Tpattern, matchnum).stretch_blockstart = stretch_blockstart;
917 	if (identjump || identautocomp) {
918 		g_array_index(st->matches, Tpattern, matchnum).identmode = identmode;
919 		g_array_index(st->matches, Tpattern, matchnum).identaction = (identjump?1:0) | (identautocomp?2:0);
920 	}
921 }
922 
923 guint16
add_pattern_to_scanning_table(Tscantable * st,const gchar * pattern,gboolean is_regex,gboolean case_insens,gint16 context,gpointer ldb)924 add_pattern_to_scanning_table(Tscantable * st, const gchar * pattern,
925 								gboolean is_regex,
926 								gboolean case_insens,
927 								gint16 context, gpointer ldb)
928 {
929 	guint16 matchnum;
930 	if (!pattern || pattern[0] == '\0') {
931 		g_warning("corrupt language file: found empty pattern. Results are undefined\n");
932 		return 0;
933 	}
934 	matchnum = st->matches->len;
935 	if (matchnum == G_MAXUINT16) {
936 		g_warning("Language file has too many patterns, this will very likely result in a crash!\n");
937 	}
938 	g_array_set_size(st->matches, st->matches->len + 1);
939 	g_array_index(st->matches, Tpattern, matchnum).pattern = g_strdup(pattern);
940 	g_array_index(st->matches, Tpattern, matchnum).case_insens = case_insens;
941 	g_array_index(st->matches, Tpattern, matchnum).is_regex = is_regex;
942 	DBG_PATCOMPILE("add_pattern_to_scanning_table,pattern=%s for context=%d got matchnum %d\n",pattern, context, matchnum);
943 	if (is_regex) {
944 		compile_limitedregex_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern, case_insens, matchnum, context, ldb);
945 	} else {
946 		compile_keyword_to_DFA(st, g_array_index(st->matches, Tpattern, matchnum).pattern, matchnum, context, case_insens, ldb);
947 	}
948 	/*if (g_strcmp0(pattern, "rem ")==0 || g_strcmp0(pattern, "\\.?[a-zA-Z][a-zA-Z_0-9]*[\\$%]?")==0) {
949 		print_DFA_subset(st, context, "rem var");
950 	}*/
951 	return matchnum;
952 }
953 
954 
955 /*guint16
956 add_keyword_to_scanning_table(Tscantable * st, gchar * pattern, const gchar * lang,
957 							  const gchar * selfhighlight, const gchar * blockhighlight, gboolean is_regex,
958 							  gboolean case_insens, gint16 context, gint16 nextcontext, gboolean starts_block,
959 							  gboolean ends_block, guint blockstartpattern,
960 							  gboolean tagclose_from_blockstack, gboolean stretch_blockstart,
961 							  guint8 identmode, gboolean identjump, gboolean identautocomp)
962 {
963 	guint16 matchnum;
964 
965 	if (!pattern || pattern[0] == '\0') {
966 		g_print("CORRUPT LANGUAGE FILE: empty pattern/tag/keyword\n");
967 		return 0;
968 	}
969 
970 	if (context == nextcontext)
971 		DBG_PATCOMPILE("context=nextcontext=%d for %s\n", context, pattern);
972 
973 	matchnum = st->matches->len;
974 	DBG_BLOCKMATCH("new match %s at matchnum %d has blockstartpattern %d and nextcontext %d\n", pattern,
975 				   matchnum, blockstartpattern, nextcontext);
976 	g_array_set_size(st->matches, st->matches->len + 1);
977 
978 	g_array_index(st->matches, Tpattern, matchnum).pattern = g_strdup(pattern);
979 	g_array_index(st->matches, Tpattern, matchnum).ends_block = ends_block;
980 	g_array_index(st->matches, Tpattern, matchnum).starts_block = starts_block;
981 	g_array_index(st->matches, Tpattern, matchnum).blockstartpattern = blockstartpattern;
982 	g_array_index(st->matches, Tpattern, matchnum).blockhighlight = (gchar *) blockhighlight;
983 	g_array_index(st->matches, Tpattern, matchnum).nextcontext = nextcontext;
984 	g_array_index(st->matches, Tpattern, matchnum).case_insens = case_insens;
985 	g_array_index(st->matches, Tpattern, matchnum).is_regex = is_regex;
986 	g_array_index(st->matches, Tpattern, matchnum).selfhighlight = (gchar *) selfhighlight;
987 
988 	g_array_index(st->matches, Tpattern, matchnum).tagclose_from_blockstack = tagclose_from_blockstack;
989 	g_array_index(st->matches, Tpattern, matchnum).stretch_blockstart = stretch_blockstart;
990 #ifdef IDENTSTORING
991 	if (identjump || identautocomp) {
992 		g_array_index(st->matches, Tpattern, matchnum).identmode = identmode;
993 		g_array_index(st->matches, Tpattern, matchnum).identaction = (identjump?1:0) | (identautocomp?2:0);
994 	}
995 #endif
996 	DBG_PATCOMPILE
997 		("add_keyword_to_scanning_table,pattern=%s,starts_block=%d,ends_block=%d,blockstartpattern=%d, context=%d,nextcontext=%d and got matchnum %d\n",
998 		 pattern, starts_block, ends_block, blockstartpattern, context, nextcontext, matchnum);
999 
1000 	if (is_regex) {
1001 		compile_limitedregex_to_DFA(st, pattern, case_insens, matchnum, context);
1002 	} else {
1003 		compile_keyword_to_DFA(st, pattern, matchnum, context, case_insens);
1004 	}
1005 	/ *print_DFA(st, 'a', 'z'); * /
1006 	return matchnum;
1007 }*/
1008 
1009 void
print_DFA_subset(Tscantable * st,gint16 context,char * chars)1010 print_DFA_subset(Tscantable * st, gint16 context, char *chars)
1011 {
1012 	gint i, j, len;
1013 	len = strlen(chars);
1014 	g_print("***************** print subset of DFA table for context %d\n",(gint)context);
1015 	g_print("       ");
1016 	for (j = 0; j < len; j++) {
1017 		switch(chars[j]) {
1018 			case '\n':
1019 				g_print(" \\n   ");
1020 				break;
1021 			case '\r':
1022 				g_print(" \\r   ");
1023 				break;
1024 			case '\t':
1025 				g_print(" \\t   ");
1026 				break;
1027 			default:
1028 				g_print(" '%c' ", chars[j]);
1029 		}
1030 	}
1031 	g_print(": match\n");
1032 	for (i = 0; i < g_array_index(st->contexts, Tcontext, context).table->len; i++) {
1033 		g_print("%4d: ", i);
1034 		for (j = 0; j < len; j++) {
1035 			g_print("%4d ", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).row[(gshort) chars[j]]);
1036 		}
1037 		g_print(" : %4d", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match);
1038 		if (i==0) {
1039 			g_print(" 	this is the startstate");
1040 		} else if (i==1) {
1041 			g_print(" 	this is the identstate");
1042 		}
1043 		if (g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match > 0) {
1044 			g_print(" %s",
1045 					g_array_index(st->matches, Tpattern,
1046 								  g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).pattern);
1047 			if (g_array_index(st->matches, Tpattern, g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext
1048 				> 0) {
1049 				g_print(" 	--> goto context %d",
1050 						g_array_index(st->matches, Tpattern,
1051 									  g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext);
1052 			} else
1053 				if (g_array_index
1054 					(st->matches, Tpattern, g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext < 0) {
1055 				g_print(" 	--> pop context: %d",
1056 						g_array_index(st->matches, Tpattern,
1057 									  g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).nextcontext);
1058 			}
1059 		}
1060 		g_print("\n");
1061 	}
1062 	g_print("*****************\n");
1063 }
1064 #ifdef ENABLE_PRINT_DFA
1065 void
print_DFA(Tscantable * st,gint16 context,char start,char end)1066 print_DFA(Tscantable * st, gint16 context, char start, char end)
1067 {
1068 	gint i, j;
1069 	g_print("    ");
1070 	for (j = start; j <= end; j++) {
1071 		g_print("  %c ", j);
1072 	}
1073 	g_print(": match\n");
1074 	for (i = 0; i < g_array_index(st->contexts, Tcontext, context).table->len; i++) {
1075 		g_print("%3d: ", i);
1076 		for (j = start; j <= end; j++) {
1077 			g_print("%3d ", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).row[j]);
1078 		}
1079 		g_print(": %3d (%s)\n", g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match,
1080 					g_array_index(st->matches, Tpattern,g_array_index(g_array_index(st->contexts, Tcontext, context).table, Ttablerow, i).match).pattern);
1081 	}
1082 
1083 }
1084 #endif /* ENABLE_PRINT_DFA */
1085 
1086 Tscantable *
scantable_new(guint size_table,guint size_matches,guint size_contexts)1087 scantable_new(guint size_table, guint size_matches, guint size_contexts)
1088 {
1089 	Tscantable *st;
1090 	st = g_slice_new0(Tscantable);
1091 	DBG_PARSING("scantable_new, initial size table %d, contexts %d, matches %d\n", size_table, size_matches,
1092 				size_contexts);
1093 	st->contexts = g_array_sized_new(TRUE, TRUE, sizeof(Tcontext), size_contexts);
1094 	st->matches = g_array_sized_new(TRUE, TRUE, sizeof(Tpattern), size_matches);
1095 	st->comments = g_array_sized_new(TRUE, FALSE, sizeof(Tcomment), 8);
1096 	st->blocks = g_array_sized_new(TRUE, FALSE, sizeof(Tpattern_block), 8);
1097 	st->conditions = g_array_sized_new(TRUE, FALSE, sizeof(Tpattern_block), 8);
1098 	st->matches->len = 1;		/* match 0 means no match */
1099 	st->contexts->len = 1;		/* a match with nextcontext 0 means no context change, so we cannot use context 0 */
1100 	st->blocks->len = 1;			/* block 0 means no block */
1101 	st->conditions->len = 1;
1102 	return st;
1103 }
1104 
1105