1 %top {
2 /* This file is part of GNU Dico.
3    Copyright (C) 2012-2020 Sergey Poznyakoff
4 
5    GNU Dico is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    GNU Dico is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with GNU Dico.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include <config.h>
19 #include <dico.h>
20 #include <unistd.h>
21 #include <getopt.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <sysexits.h>
26 #include <setjmp.h>
27 #include <appi18n.h>
28 #include "gcide.h"
29 #include "wordsplit.h"
30 
31 #define yy_create_buffer      gcide_markup_yy_create_buffer
32 #define yy_delete_buffer      gcide_markup_yy_delete_buffer
33 #define yy_flex_debug	      gcide_markup_yy_flex_debug
34 #define yy_init_buffer	      gcide_markup_yy_init_buffer
35 #define yy_flush_buffer	      gcide_markup_yy_flush_buffer
36 #define yy_load_buffer_state  gcide_markup_yy_load_buffer_state
37 #define yy_switch_to_buffer   gcide_markup_yy_switch_to_buffer
38 #define yyin		      gcide_markup_yyin
39 #define yyleng		      gcide_markup_yyleng
40 #define yylex		      gcide_markup_yylex
41 #define yylineno	      gcide_markup_yylineno
42 #define yyout		      gcide_markup_yyout
43 #define yyrestart	      gcide_markup_yyrestart
44 #define yytext		      gcide_markup_yytext
45 #define yywrap		      gcide_markup_yywrap
46 #define yyalloc		      gcide_markup_yyalloc
47 #define yyrealloc	      gcide_markup_yyrealloc
48 #define yyfree                gcide_markup_yyfree
49 #define yyunput               gcide_markup_yyunput
50 
51 #define yylex_destroy         gcide_markup_yylex_destroy
52 #define yyget_debug	      gcide_markup_yyget_debug
53 #define yyset_debug	      gcide_markup_yyset_debug
54 #define yyget_extra	      gcide_markup_yyget_extra
55 #define yyset_extra	      gcide_markup_yyset_extra
56 #define yyget_in	      gcide_markup_yyget_in
57 #define yyset_in	      gcide_markup_yyset_in
58 #define yyget_out	      gcide_markup_yyget_out
59 #define yyset_out	      gcide_markup_yyset_out
60 #define yyget_leng	      gcide_markup_yyget_leng
61 #define yyget_text	      gcide_markup_yyget_text
62 #define yyget_lineno	      gcide_markup_yyget_lineno
63 #define yyset_lineno	      gcide_markup_yyset_lineno
64 
65 }
66 %{
67 
/* Input handed to the scanner through YY_INPUT: source buffer and the
   number of bytes not yet delivered. */
static char const *input_buf;
static size_t input_len;
/* Offsets of the current token from the start of the input; updated by
   YY_USER_ACTION (token_beg = previous token_end, token_end += yyleng). */
static unsigned token_beg;
static unsigned token_end;

static char *textspace;  /* Text storage space */
static size_t textsize;  /* Size of text space */
static size_t textpos;   /* Current position in the text space */
static size_t textstart; /* Start of the current text segment */

/* Stack of enclosing tags (see push_tag/pop_tag) and the tag that is
   currently receiving content. */
static dico_list_t tagstk;
static struct gcide_tag *current_tag;

/* Recovery point set up by gcide_markup_parse; memerr and alloc_tag jump
   to it on failure. */
static jmp_buf errbuf;
82 
/* Report an out-of-memory condition and abandon the current parse.
   TEXT identifies the failing caller in the log message.  Control is
   transferred to the setjmp point stored in errbuf, so this function
   does not return to its caller. */
static void
memerr(const char *text)
{
    dico_log(L_ERR, ENOMEM, "%s", text);
    longjmp(errbuf, 1);
}
89 
/* Hook run for every matched token: slide the [token_beg, token_end)
   window forward by the length of the match, so that token_beg can be
   used as the token offset in diagnostics. */
#define YY_USER_ACTION do {						\
	token_beg = token_end;						\
	token_end += yyleng;						\
    } while (0);
/* Feed the scanner from the in-memory buffer input_buf/input_len instead
   of a stdio stream.  Copies at most MAX_SIZE bytes to BUF, stores the
   number of bytes copied in RESULT (0 once the input is exhausted) and
   advances input_buf past the delivered bytes, so that successive
   refills continue where the previous one stopped. */
#define YY_INPUT(buf,result,max_size) do {				\
	size_t __n = (max_size) > input_len ? input_len : (max_size);	\
	if (__n) {							\
	    memcpy((buf), input_buf, __n);				\
	    input_buf += __n;						\
	    input_len -= __n;						\
	}								\
	(result) = __n;							\
    } while(0)
101 
/* State saved while the scanner is inside a comment: the start condition
   to return to, and the token that terminates the comment. */
static int retstate;
static char *endtag;
/* Enter the COMMENT start condition, remembering the current start
   condition and the terminating token END. */
#define BEGIN_COMMENT(end) \
    { retstate = YYSTATE; endtag = (end); BEGIN(COMMENT); }
106 
/* Append L bytes starting at S to the text space at textpos.

   The text space is enlarged when the free room is too small.  Its size
   is doubled repeatedly until the request fits: a single doubling is not
   enough when L exceeds the current size, or when the current size is 0.
   On allocation failure (or size overflow) memerr is called, which does
   not return.  textspace may move as a result of this call. */
static void
text_add_str(char const *s, size_t l)
{
    if (textsize - textpos < l) {
	size_t nsize = textsize ? textsize : 1;
	char *newp;

	while (nsize - textpos < l) {
	    if (nsize > (size_t)-1 / 2)
		memerr("text_add");
	    nsize *= 2;
	}
	newp = realloc(textspace, nsize);
	if (!newp)
	    memerr("text_add");
	textspace = newp;
	textsize = nsize;
    }
    if (l)
	memcpy(textspace + textpos, s, l);
    textpos += l;
}
122 
/* Append the single character CH (converted to char) to the text space. */
static void
text_add_chr(int ch)
{
    char byte = ch;

    text_add_str(&byte, 1);
}
129 
/* Number of bytes accumulated in the current (unfinished) text segment. */
#define text_segment_length() (textpos - textstart)
131 
/* Close the current text segment: terminate it with a null byte and make
   the next segment begin right after it.  Returns the offset in the text
   space at which the finished segment starts. */
static size_t
text_segment_finish()
{
    size_t segment_offset = textstart;

    text_add_chr('\0');
    textstart = textpos;
    return segment_offset;
}
140 
/* Create a tag from the LEN bytes at TEXT (the tag text without the
   angle brackets).

   The text is split into words with wordsplit_len; the resulting word
   count and word vector become the tag's tag_parmc and tag_parmv.  The
   vector is detached from the wordsplit structure before that structure
   is freed, so the new tag is the only holder of it.

   On a split error the problem is logged and control jumps to errbuf.
   On allocation failure the split result is released first, then memerr
   is called.  Neither error path returns to the caller. */
static struct gcide_tag *
alloc_tag(const char *text, size_t len)
{
    struct gcide_tag *tag;
    struct wordsplit ws;

    if (wordsplit_len(text, len, &ws, WRDSF_DEFFLAGS & ~WRDSF_CESCAPES)) {
	dico_log(L_ERR, 0, _("cannot parse line %.*s: %s"),
		 (int)len, text, wordsplit_strerror(&ws));
	longjmp(errbuf, 1);
    }
    tag = calloc(1, sizeof(*tag));
    if (!tag) {
	/* memerr does not return: free the words now or they leak. */
	wordsplit_free(&ws);
	memerr("gcide alloc_tag");
    }
    tag->tag_parmc = ws.ws_wordc;
    tag->tag_parmv = ws.ws_wordv;
    /* Detach the vector so that wordsplit_free leaves it alone. */
    ws.ws_wordc = 0;
    ws.ws_wordv = NULL;
    wordsplit_free(&ws);
    return tag;
}
163 
/* List item destructor for tags (also called directly on tree roots).
   ITEM is a struct gcide_tag pointer or NULL; DATA is unused.  For a tag
   holding a tag list, the list is destroyed; other content types need no
   action here.  Always returns 0.

   NOTE(review): neither the tag structure itself nor its tag_parmv vector
   is released by this function -- confirm who is expected to free them. */
static int
free_tag(void *item, void *data)
{
    struct gcide_tag *tag = item;

    if (tag && tag->tag_type == gcide_content_taglist)
	dico_list_destroy(&tag->tag_v.taglist);
    return 0;
}
180 
/* Add TAG as the last child of current_tag.

   current_tag is converted to a tag list first if necessary:
     - unspecified content: an empty list is created;
     - text content: a list is created whose first element is a new text
       tag carrying the old text position;
     - tag list: used as is.

   If any allocation fails, TAG (and the intermediate subtag, if any) is
   freed and memerr is called, which does not return. */
static void
append_tag(struct gcide_tag *tag)
{
    dico_list_t list;

    switch (current_tag->tag_type) {
    case gcide_content_unspecified:
	list = dico_list_create();
	if (!list) {
	    free(tag);
	    memerr("append_tag");
	}
	dico_list_set_free_item(list, free_tag, NULL);
	current_tag->tag_type = gcide_content_taglist;
	current_tag->tag_v.taglist = list;
	break;

    case gcide_content_text: {
	struct gcide_tag *subtag = calloc(1, sizeof(*subtag));

	if (!subtag) {
	    /* Same cleanup as the other failure branches. */
	    free(tag);
	    memerr("append_tag");
	}
	subtag->tag_type = gcide_content_text;
	subtag->tag_v.textpos = current_tag->tag_v.textpos;
	list = dico_list_create();
	if (!list) {
	    free(subtag);
	    free(tag);
	    memerr("append_tag");
	}
	dico_list_set_free_item(list, free_tag, NULL);
	dico_list_append(list, subtag);
	current_tag->tag_type = gcide_content_taglist;
	current_tag->tag_v.taglist = list;
	break;
    }

    case gcide_content_taglist:
	break;
    }

    dico_list_append(current_tag->tag_v.taglist, tag);
}
224 
/* Non-zero while the tag being filled is named "grk" (set by push_tag,
   cleared by pop_tag). */
static int in_grk;
226 
/* Open TAG: attach it to current_tag as a child, save current_tag on the
   tag stack and make TAG the current tag.  in_grk is set to 1 if TAG has
   parameters and its name is "grk", and to 0 otherwise. */
static void
push_tag(struct gcide_tag *tag)
{
    append_tag(tag);
    dico_list_push(tagstk, current_tag);
    current_tag = tag;

    if (current_tag->tag_parmc && strcmp(current_tag->tag_name, "grk") == 0)
	in_grk = 1;
    else
	in_grk = 0;
}
236 
/* Handle a closing tag.  TAGSTR/TAGLEN is the tag text following "</",
   without the final ">".  The tag name is the part of it that precedes
   the first space or tab.

   A closing "grk" always clears in_grk.  If the name equals the name of
   current_tag, the enclosing tag is popped from the stack and becomes
   current; otherwise a warning with the token offset is logged and the
   stack is left untouched. */
static void
pop_tag(const char *tagstr, size_t taglen)
{
    size_t namelen = 0;

    while (namelen < taglen
	   && tagstr[namelen] != ' ' && tagstr[namelen] != '\t')
	namelen++;

    if (namelen == 3 && memcmp(tagstr, "grk", 3) == 0)
	in_grk = 0;

    if (!TAG_HAS_NAME(current_tag) ||
	strlen(current_tag->tag_name) != namelen ||
	memcmp(current_tag->tag_name, tagstr, namelen) != 0) {
	dico_log(L_WARN, 0, "%u: unexpected close tag", token_beg);
	return;
    }
    current_tag = dico_list_pop(tagstk);
}
256 %}
257 
258 %option 8bit
259 %option nounput
260 %option noinput
261 
262 %x COMMENT
263 
264 XD [0-9a-f]
265 %%
<INITIAL>{
  "<--"    BEGIN_COMMENT("-->"); /* comment, skipped up to the --> token */
  "<!"     BEGIN_COMMENT("!>"); /* comment, skipped up to the !> token */
  "<p>"|"</p>" ; /* paragraph tags are dropped */
  "<"[a-zA-Z][^/>]*">" {
      /* Opening tag.  Pending text, if any, becomes a text child of the
	 current tag; then the new tag is created from the text between
	 the angle brackets and made current. */
      struct gcide_tag *tag;

      if (text_segment_length()) {
	  tag = calloc(1, sizeof(*tag));
	  if (!tag)
	      memerr("gcide lexer");
	  tag->tag_type = gcide_content_text;
	  tag->tag_v.textpos = text_segment_finish();
	  append_tag(tag);
      }

      push_tag(alloc_tag(yytext + 1, yyleng - 2));
  }
  "</"[a-zA-Z][^>]*">" {
      /* Closing tag.  Pending text is flushed as a text child first;
	 pop_tag receives the text after the leading two characters,
	 without the final one. */
      if (text_segment_length()) {
	  struct gcide_tag *tag = calloc(1, sizeof(*tag));
	  if (!tag)
	      memerr("gcide lexer");
	  tag->tag_type = gcide_content_text;
	  tag->tag_v.textpos = text_segment_finish();
	  append_tag(tag);
      }
      pop_tag(yytext + 2, yyleng - 3);
  }
  "<"[a-zA-Z?][a-zA-Z0-9]*"/" {
      /* Entity reference.  The whole token is passed to
	 gcide_entity_to_utf8; a non-NULL result is appended to the text,
	 otherwise a warning is logged and nothing is appended. */
      char const *s = gcide_entity_to_utf8(yytext);
      if (s)
	  text_add_str(s, strlen(s));
      else
	  dico_log(L_WARN, 0, _("%u: unrecognized entity: %s"),
		   token_beg, yytext);
  }
  [""*`]   { /* kept only inside a grk tag, dropped elsewhere */ if (in_grk)  text_add_chr(yytext[0]); }
  "\\'"{XD}{XD} {
      /* Backslash-apostrophe escape followed by two hex digits.  The
	 digits are passed to gcide_escape_to_utf8; if it returns NULL the
	 sequence is copied verbatim and a warning is logged. */
      char const *s = gcide_escape_to_utf8(yytext+2);

      if (s)
	  text_add_str(s, strlen(s));
      else {
	  text_add_str(yytext, yyleng);
	  dico_log(L_WARN, 0,
		   _("%u: unknown character sequence %s"),
		   token_beg, yytext);
      }
  }
  \r    ; /* carriage returns are dropped */
  .     text_add_str(yytext, yyleng); /* any other character is text */
  \n    text_add_str(yytext, yyleng); /* newlines are kept as text */
}
<COMMENT>{
  [^< \t][^ \t>]*">" {
      /* A candidate terminator: leave the comment only if the token is
	 exactly the one recorded by BEGIN_COMMENT. */
      if (strcmp(yytext, endtag) == 0)
	  BEGIN(retstate);
  }
  .          ; /* everything else inside a comment is discarded */
  \n         ; /* including newlines */
}
328 %%
329 
/* End-of-input hook called by the scanner.  Always returns 1, meaning
   there is no further input to switch to.  Declared with a (void)
   parameter list so that the definition is a proper prototype. */
int
yywrap(void)
{
    return 1;
}
335 
/* Arguments carried through dico_list_iterate by inorder_helper. */
struct walk_closure {
    /* User callback; inorder_helper passes 0 as the first argument before
       descending into a tag and 1 after its children were visited. */
    int (*fun)(int, struct gcide_tag *, void *);
    /* Opaque pointer handed to fun as its last argument. */
    void *data;
};
340 
/* Visit one tag on behalf of gcide_parse_tree_inorder.  ITEM is the tag,
   DATA a struct walk_closure.

   The callback is invoked with 0 before the tag is processed; a non-zero
   result ends the visit with return value 1.  For a tag holding a tag
   list, every child is visited through dico_list_iterate and the callback
   is then invoked once more with 1; a non-zero result again yields 1.
   In all other cases 0 is returned. */
static int
inorder_helper(void *item, void *data)
{
    struct walk_closure *closure = data;
    struct gcide_tag *node = item;

    if (closure->fun(0, node, closure->data))
	return 1;
    if (node->tag_type != gcide_content_taglist)
	return 0;

    dico_list_iterate(node->tag_v.taglist, inorder_helper, data);
    return closure->fun(1, node, closure->data) ? 1 : 0;
}
356 
/* Walk the parse tree TP starting at its root, calling FUN for each tag.
   FUN receives 0 as its first argument when a tag is entered and 1 after
   the children of a tag-list tag have been visited; DATA is passed to it
   unchanged.  Returns the result of visiting the root: 1 if FUN returned
   non-zero for it, 0 otherwise. */
int
gcide_parse_tree_inorder(struct gcide_parse_tree *tp,
			 int (*fun)(int, struct gcide_tag *, void *),
			 void *data)
{
    struct walk_closure wc;

    wc.data = data;
    wc.fun = fun;
    return inorder_helper(tp->root, &wc);
}
367 
368 
/* Final pass over the tree: turn text offsets into pointers.  ITEM is a
   tag, DATA the base address of the text space.  A text tag gets its
   text pointer set to base + stored offset; a tag-list tag has the same
   conversion applied to each of its children.  Always returns 0. */
static int
tag_fixup(void *item, void *data)
{
    struct gcide_tag *tag = item;
    char *base = data;

    if (tag->tag_type == gcide_content_text)
	tag->tag_v.text = base + tag->tag_v.textpos;
    else if (tag->tag_type == gcide_content_taglist)
	dico_list_iterate(tag->tag_v.taglist, tag_fixup, base);
    return 0;
}
387 
/* Re-encode the null-terminated string stored at offset N of the text
   space, appending the result as a new text segment.

   At each position gcide_grk_to_utf8 is consulted: if it returns a
   string, that string is appended and the number of input bytes it
   reports is skipped; otherwise the input byte is copied as is.  The
   text space is always addressed through the global pointer and an
   offset, because appending may reallocate it.

   Returns the offset of the new segment. */
static size_t
greek_translit(size_t n)
{
    size_t pos;

    for (pos = n; textspace[pos] != 0; ) {
	size_t consumed;
	const char *utf8 = gcide_grk_to_utf8(textspace + pos, &consumed);

	if (!utf8) {
	    text_add_chr(textspace[pos]);
	    pos++;
	    continue;
	}
	text_add_str(utf8, strlen(utf8));
	pos += consumed;
    }
    return text_segment_finish();
}
405 
/* Pass run over the tree while text is still addressed by offsets.
   ITEM is a tag; DATA is NULL or points to an int telling whether the
   parent tag requests Greek transliteration.

   A text tag is transliterated (its offset replaced by the one returned
   from greek_translit) only if that flag is set.  A tag-list tag computes
   a fresh flag for its own children -- set when the tag has parameters
   and is named "grk" -- and applies this function to each of them.
   Always returns 0. */
static int
early_fixup(void *item, void *data)
{
    struct gcide_tag *tag = item;
    int in_greek = 0;

    if (data)
	in_greek = *(int*)data;

    if (tag->tag_type == gcide_content_text) {
	if (in_greek)
	    tag->tag_v.textpos = greek_translit(tag->tag_v.textpos);
    } else if (tag->tag_type == gcide_content_taglist) {
	in_greek = (tag->tag_parmc && strcmp(tag->tag_name, "grk") == 0);
	dico_list_iterate(tag->tag_v.taglist, early_fixup, &in_greek);
    }
    return 0;
}
426 
/* Parse LEN bytes of GCIDE markup at TEXT into a tag tree.

   DBG is stored in yy_flex_debug.  On success a newly allocated parse
   tree is returned: its textspace/textsize describe the text storage and
   root is the top-level tag; text tags point into that storage.  Release
   it with gcide_parse_tree_free.

   Returns NULL on failure.  Failures detected in helper functions
   (memerr, alloc_tag) arrive here through longjmp to errbuf; the handler
   releases what this call has allocated, discards input still buffered
   by the scanner and returns. */
struct gcide_parse_tree *
gcide_markup_parse(char const *text, size_t len, int dbg)
{
    struct gcide_parse_tree *tp;
    struct gcide_tag *p;

    input_buf = text;
    input_len = len;
    token_beg = token_end = 0;

    /* The globals may still hold values from a previous call, whose
       objects now belong to the tree returned by that call (or were
       already freed).  Reset them so that the error handler below never
       touches anything this call did not allocate. */
    textspace = NULL;
    tagstk = NULL;
    current_tag = NULL;

    if (setjmp(errbuf)) {
	free(textspace);
	textspace = NULL;
	dico_list_destroy(&tagstk);
	free_tag(current_tag, NULL);
	current_tag = NULL;
	/* The scan may have been interrupted in mid-buffer: drop the
	   buffered input so that the next parse starts clean. */
	YY_FLUSH_BUFFER;
	return NULL;
    }

    /* Never ask for a zero-sized block: malloc(0) may return NULL. */
    textsize = len ? 2 * len : 1;
    textspace = malloc(textsize);
    if (!textspace)
	memerr("gcide_markup_parse");
    textpos = textstart = 0;

    tagstk = dico_list_create();
    if (!tagstk)
	memerr("gcide_markup_parse");
    dico_list_set_free_item(tagstk, free_tag, NULL);
    current_tag = calloc(1, sizeof(*current_tag));
    if (!current_tag)
	memerr("gcide_markup_parse");

    yy_flex_debug = dbg;
    BEGIN(INITIAL);
    while (yylex ())
        ;

    /* Append trailing text segment, if any */
    if (text_segment_length()) {
	struct gcide_tag *tag = calloc(1, sizeof(*tag));
	if (!tag)
	    memerr("gcide lexer");
	tag->tag_type = gcide_content_text;
	tag->tag_v.textpos = text_segment_finish();
	append_tag(tag);
    }

    /* Unwind tags left open; the last one popped is the root. */
    while ((p = dico_list_pop(tagstk)))
	/* FIXME: Report unclosed tag */
	current_tag = p;

    dico_list_destroy(&tagstk);

    if (!current_tag) {
	free(textspace);
	textspace = NULL;
	return NULL;
    }

    /* Transliteration appends to the text space and may move it, so it
       has to run before offsets are converted to pointers. */
    early_fixup(current_tag, NULL);

    tp = malloc(sizeof(*tp));
    if (!tp)
	memerr("gcide_markup_parse");

    tp->textspace = textspace;
    tp->textsize = textsize;

    tp->root = current_tag;

    tag_fixup(tp->root, textspace);

    return tp;
}
498 
/* Release a parse tree returned by gcide_markup_parse: its text storage,
   the contents of its root tag (via free_tag) and the tree structure
   itself.  A NULL TP -- which gcide_markup_parse returns on failure -- is
   accepted and ignored. */
void
gcide_parse_tree_free(struct gcide_parse_tree *tp)
{
    if (!tp)
	return;
    free(tp->textspace);
    free_tag(tp->root, NULL);
    free(tp);
}
506 
507