1 %top {
2 /* This file is part of GNU Dico.
3 Copyright (C) 2012-2020 Sergey Poznyakoff
4
5 GNU Dico is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GNU Dico is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GNU Dico. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include <config.h>
19 #include <dico.h>
20 #include <unistd.h>
21 #include <getopt.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <sysexits.h>
26 #include <setjmp.h>
27 #include <appi18n.h>
28 #include "gcide.h"
29 #include "wordsplit.h"
30
31 #define yy_create_buffer gcide_markup_yy_create_buffer
32 #define yy_delete_buffer gcide_markup_yy_delete_buffer
33 #define yy_flex_debug gcide_markup_yy_flex_debug
34 #define yy_init_buffer gcide_markup_yy_init_buffer
35 #define yy_flush_buffer gcide_markup_yy_flush_buffer
36 #define yy_load_buffer_state gcide_markup_yy_load_buffer_state
37 #define yy_switch_to_buffer gcide_markup_yy_switch_to_buffer
38 #define yyin gcide_markup_yyin
39 #define yyleng gcide_markup_yyleng
40 #define yylex gcide_markup_yylex
41 #define yylineno gcide_markup_yylineno
42 #define yyout gcide_markup_yyout
43 #define yyrestart gcide_markup_yyrestart
44 #define yytext gcide_markup_yytext
45 #define yywrap gcide_markup_yywrap
46 #define yyalloc gcide_markup_yyalloc
47 #define yyrealloc gcide_markup_yyrealloc
48 #define yyfree gcide_markup_yyfree
49 #define yyunput gcide_markup_yyunput
50
51 #define yylex_destroy gcide_markup_yylex_destroy
52 #define yyget_debug gcide_markup_yyget_debug
53 #define yyset_debug gcide_markup_yyset_debug
54 #define yyget_extra gcide_markup_yyget_extra
55 #define yyset_extra gcide_markup_yyset_extra
56 #define yyget_in gcide_markup_yyget_in
57 #define yyset_in gcide_markup_yyset_in
58 #define yyget_out gcide_markup_yyget_out
59 #define yyset_out gcide_markup_yyset_out
60 #define yyget_leng gcide_markup_yyget_leng
61 #define yyget_text gcide_markup_yyget_text
62 #define yyget_lineno gcide_markup_yyget_lineno
63 #define yyset_lineno gcide_markup_yyset_lineno
64
65 }
66 %{
67
68 static char const *input_buf;
69 static size_t input_len;
70 static unsigned token_beg;
71 static unsigned token_end;
72
73 static char *textspace; /* Text storage space */
74 static size_t textsize; /* Size of text space */
75 static size_t textpos; /* Current position in the text space */
76 static size_t textstart; /* Start of the current text segment */
77
78 static dico_list_t tagstk;
79 static struct gcide_tag *current_tag;
80
81 static jmp_buf errbuf;
82
83 static void
memerr(const char * text)84 memerr(const char *text)
85 {
86 dico_log(L_ERR, ENOMEM, "%s", text);
87 longjmp(errbuf, 1);
88 }
89
/* Run for every matched rule: record the offsets of the current token
   within the input.  The trailing semicolon is part of the definition
   because the generated scanner expands YY_USER_ACTION as a statement
   of its own. */
#define YY_USER_ACTION do {                     \
        token_beg = token_end;                  \
        token_end += yyleng;                    \
    } while (0);

/* Feed the scanner from the in-memory buffer input_buf/input_len.
   Up to MAX_SIZE bytes are copied to BUF and their number is stored in
   RESULT (0 signals end of input).  Both the read pointer and the
   remaining length are advanced, so that consecutive refills deliver
   consecutive parts of the input rather than its beginning again. */
#define YY_INPUT(buf,result,max_size) do {                              \
        size_t gcide_n_ =                                               \
            (max_size) > input_len ? input_len : (max_size);            \
        if (gcide_n_) {                                                 \
            memcpy((buf), input_buf, gcide_n_);                         \
            input_buf += gcide_n_;                                      \
            input_len -= gcide_n_;                                      \
        }                                                               \
        (result) = gcide_n_;                                            \
    } while(0)
101
/* Start condition to return to when the current comment ends. */
static int retstate;
/* Token that terminates the current comment.  It only ever points to
   string literals and is only read, hence the const qualifier. */
static char const *endtag;

/* Enter the COMMENT start condition, remembering the start condition
   that was active and the terminator END.  Wrapped in do/while so the
   expansion behaves as a single statement. */
#define BEGIN_COMMENT(end) do {                 \
        retstate = YYSTATE;                     \
        endtag = (end);                         \
        BEGIN(COMMENT);                         \
    } while (0)
106
107 static void
text_add_str(char const * s,size_t l)108 text_add_str(char const *s, size_t l)
109 {
110 size_t rest = textsize - textpos;
111 if (rest < l) {
112 size_t nsize = 2 * textsize;
113 char *newp = realloc(textspace, nsize);
114 if (!newp)
115 memerr("text_add");
116 textspace = newp;
117 textsize = nsize;
118 }
119 memcpy(textspace + textpos, s, l);
120 textpos += l;
121 }
122
/* Append the single character CH (converted to char) to the text
   space. */
static void
text_add_chr(int ch)
{
    char const byte = (char) ch;

    text_add_str(&byte, 1);
}
129
130 #define text_segment_length() (textpos - textstart)
131
132 static size_t
text_segment_finish()133 text_segment_finish()
134 {
135 size_t ret = textstart;
136 text_add_chr(0);
137 textstart = textpos;
138 return ret;
139 }
140
141 static struct gcide_tag *
alloc_tag(const char * text,size_t len)142 alloc_tag(const char *text, size_t len)
143 {
144 struct gcide_tag *tag;
145 struct wordsplit ws;
146
147 if (wordsplit_len(text, len, &ws, WRDSF_DEFFLAGS & ~WRDSF_CESCAPES)) {
148 dico_log(L_ERR, 0, _("cannot parse line %.*s: %s"),
149 (int)len, text, wordsplit_strerror(&ws));
150 longjmp(errbuf, 1);
151 }
152 tag = calloc(1, sizeof(*tag));
153 if (tag) {
154 tag->tag_parmc = ws.ws_wordc;
155 tag->tag_parmv = ws.ws_wordv;
156 ws.ws_wordc = 0;
157 ws.ws_wordv = NULL;
158 } else
159 memerr("gcide alloc_tag");
160 wordsplit_free(&ws);
161 return tag;
162 }
163
164 static int
free_tag(void * item,void * data)165 free_tag(void *item, void *data)
166 {
167 struct gcide_tag *tag = item;
168
169 if (!tag)
170 return 0;
171 switch (tag->tag_type) {
172 case gcide_content_unspecified:
173 case gcide_content_text:
174 break;
175 case gcide_content_taglist:
176 dico_list_destroy(&tag->tag_v.taglist);
177 }
178 return 0;
179 }
180
181 static void
append_tag(struct gcide_tag * tag)182 append_tag(struct gcide_tag *tag)
183 {
184 dico_list_t list;
185
186 switch (current_tag->tag_type) {
187 case gcide_content_unspecified:
188 list = dico_list_create();
189 if (!list) {
190 free(tag);
191 memerr("append_tag");
192 }
193 dico_list_set_free_item(list, free_tag, NULL);
194 current_tag->tag_type = gcide_content_taglist;
195 current_tag->tag_v.taglist = list;
196 break;
197
198 case gcide_content_text: {
199 struct gcide_tag *subtag = calloc(1, sizeof(*tag));
200
201 if (!subtag)
202 memerr("append_tag");
203 subtag->tag_type = gcide_content_text;
204 subtag->tag_v.textpos = current_tag->tag_v.textpos;
205 list = dico_list_create();
206 if (!list) {
207 free(subtag);
208 free(tag);
209 memerr("append_tag");
210 }
211 dico_list_set_free_item(list, free_tag, NULL);
212 dico_list_append(list, subtag);
213 current_tag->tag_type = gcide_content_taglist;
214 current_tag->tag_v.taglist = list;
215 break;
216 }
217
218 case gcide_content_taglist:
219 break;
220 }
221
222 dico_list_append(current_tag->tag_v.taglist, tag);
223 }
224
225 static int in_grk;
226
227 static void
push_tag(struct gcide_tag * tag)228 push_tag(struct gcide_tag *tag)
229 {
230 append_tag(tag);
231 dico_list_push(tagstk, current_tag);
232 current_tag = tag;
233 in_grk =
234 current_tag->tag_parmc && strcmp(current_tag->tag_name, "grk") == 0;
235 }
236
237 static void
pop_tag(const char * tagstr,size_t taglen)238 pop_tag(const char *tagstr, size_t taglen)
239 {
240 size_t len;
241
242 for (len = 0; len < taglen; len++)
243 if (tagstr[len] == ' ' || tagstr[len] == '\t')
244 break;
245
246 if (len == 3 && memcmp(tagstr, "grk", 3) == 0)
247 in_grk = 0;
248
249 if (TAG_HAS_NAME(current_tag) &&
250 strlen(current_tag->tag_name) == len &&
251 memcmp(current_tag->tag_name, tagstr, len) == 0)
252 current_tag = dico_list_pop(tagstk);
253 else
254 dico_log(L_WARN, 0, "%u: unexpected close tag", token_beg);
255 }
256 %}
257
258 %option 8bit
259 %option nounput
260 %option noinput
261
262 %x COMMENT
263
264 XD [0-9a-f]
265 %%
<INITIAL>{
"<--"   {
            /* Comment running up to the matching terminator. */
            BEGIN_COMMENT("-->");
        }
"<!"    {
            /* Second comment form, with its own terminator. */
            BEGIN_COMMENT("!>");
        }
"<p>"|"</p>"    {
            /* Paragraph markers produce neither text nor tags. */
        }
"<"[a-zA-Z][^/>]*">"    {
            /* Opening tag.  Text collected so far becomes a text tag
               of the current tag, then the new tag is opened. */
            if (text_segment_length()) {
                struct gcide_tag *txt = calloc(1, sizeof(*txt));

                if (!txt)
                    memerr("gcide lexer");
                txt->tag_type = gcide_content_text;
                txt->tag_v.textpos = text_segment_finish();
                append_tag(txt);
            }

            /* Pass the text between the angle brackets. */
            push_tag(alloc_tag(yytext + 1, yyleng - 2));
        }
"</"[a-zA-Z][^>]*">"    {
            /* Closing tag.  Flush pending text first, then close the
               tag named by the text after the leading two characters
               and before the final one. */
            if (text_segment_length()) {
                struct gcide_tag *txt = calloc(1, sizeof(*txt));

                if (!txt)
                    memerr("gcide lexer");
                txt->tag_type = gcide_content_text;
                txt->tag_v.textpos = text_segment_finish();
                append_tag(txt);
            }
            pop_tag(yytext + 2, yyleng - 3);
        }
"<"[a-zA-Z?][a-zA-Z0-9]*"/"     {
            /* Entity reference: replace it by the string returned by
               gcide_entity_to_utf8, or warn if there is none. */
            char const *utf8 = gcide_entity_to_utf8(yytext);

            if (!utf8)
                dico_log(L_WARN, 0, _("%u: unrecognized entity: %s"),
                         token_beg, yytext);
            else
                text_add_str(utf8, strlen(utf8));
        }
[""*`]  {
            /* These characters are kept only while in_grk is set;
               otherwise they are dropped. */
            if (in_grk)
                text_add_chr(yytext[0]);
        }
"\\'"{XD}{XD}   {
            /* Escape made of two hex digits: replace it by the string
               returned by gcide_escape_to_utf8; if there is none, keep
               the escape verbatim and warn. */
            char const *utf8 = gcide_escape_to_utf8(yytext+2);

            if (!utf8) {
                text_add_str(yytext, yyleng);
                dico_log(L_WARN, 0,
                         _("%u: unknown character sequence %s"),
                         token_beg, yytext);
            } else
                text_add_str(utf8, strlen(utf8));
        }
\r      {
            /* Carriage returns are dropped. */
        }
.|\n    {
            /* Anything else is ordinary text. */
            text_add_str(yytext, yyleng);
        }
}
<COMMENT>{
[^< \t][^ \t>]*">"      {
            /* Leave the comment only if this token is exactly the
               terminator recorded by BEGIN_COMMENT; the scanner then
               returns to the start condition saved in retstate. */
            if (!strcmp(yytext, endtag))
                BEGIN(retstate);
        }
.|\n    {
            /* Everything else inside a comment is discarded. */
        }
}
328 %%
329
/* End-of-input hook of the scanner.  Always returns 1, telling the
   scanner that there is no further input to switch to. */
int
yywrap(void)
{
    return 1;
}
335
/* Arguments of a tree walk, bundled so that they can be passed through
   the single data pointer of a list iterator callback. */
struct walk_closure {
    /* Visitor.  Called with 0 as its first argument when a tag is
       reached, and with 1 after the sub-tags of a tag list have been
       walked. */
    int (*fun)(int, struct gcide_tag *, void *);
    /* Opaque pointer passed to fun as its last argument. */
    void *data;
};
340
341 static int
inorder_helper(void * item,void * data)342 inorder_helper(void *item, void *data)
343 {
344 struct gcide_tag *tag = item;
345 struct walk_closure *cp = data;
346
347 if (cp->fun(0, tag, cp->data))
348 return 1;
349 if (tag->tag_type == gcide_content_taglist) {
350 dico_list_iterate(tag->tag_v.taglist, inorder_helper, data);
351 if (cp->fun(1, tag, cp->data))
352 return 1;
353 }
354 return 0;
355 }
356
357 int
gcide_parse_tree_inorder(struct gcide_parse_tree * tp,int (* fun)(int,struct gcide_tag *,void *),void * data)358 gcide_parse_tree_inorder(struct gcide_parse_tree *tp,
359 int (*fun)(int, struct gcide_tag *, void *),
360 void *data)
361 {
362 struct walk_closure clos;
363 clos.fun = fun;
364 clos.data = data;
365 return inorder_helper(tp->root, &clos);
366 }
367
368
369 static int
tag_fixup(void * item,void * data)370 tag_fixup(void *item, void *data)
371 {
372 struct gcide_tag *tag = item;
373 char *textspace = data;
374
375 switch (tag->tag_type) {
376 case gcide_content_unspecified:
377 break;
378 case gcide_content_text:
379 tag->tag_v.text = textspace + tag->tag_v.textpos;
380 break;
381 case gcide_content_taglist:
382 dico_list_iterate(tag->tag_v.taglist, tag_fixup, textspace);
383 break;
384 }
385 return 0;
386 }
387
388 static size_t
greek_translit(size_t n)389 greek_translit(size_t n)
390 {
391 while (textspace[n]) {
392 size_t rd;
393 const char *greek = gcide_grk_to_utf8(textspace + n, &rd);
394
395 if (greek) {
396 text_add_str(greek, strlen(greek));
397 n += rd;
398 } else {
399 text_add_chr(textspace[n]);
400 n++;
401 }
402 }
403 return text_segment_finish();
404 }
405
406 static int
early_fixup(void * item,void * data)407 early_fixup(void *item, void *data)
408 {
409 struct gcide_tag *tag = item;
410 int translate = data ? *(int*)data : 0;
411
412 switch (tag->tag_type) {
413 case gcide_content_unspecified:
414 break;
415 case gcide_content_text:
416 if (translate)
417 tag->tag_v.textpos = greek_translit(tag->tag_v.textpos);
418 break;
419 case gcide_content_taglist:
420 translate = (tag->tag_parmc && strcmp(tag->tag_name, "grk") == 0);
421 dico_list_iterate(tag->tag_v.taglist, early_fixup, &translate);
422 break;
423 }
424 return 0;
425 }
426
427 struct gcide_parse_tree *
gcide_markup_parse(char const * text,size_t len,int dbg)428 gcide_markup_parse(char const *text, size_t len, int dbg)
429 {
430 struct gcide_parse_tree *tp;
431 struct gcide_tag *p;
432
433 input_buf = text;
434 input_len = len;
435 token_beg = token_end = 0;
436
437 if (setjmp(errbuf)) {
438 free(textspace);
439 dico_list_destroy(&tagstk);
440 free_tag(current_tag, NULL);
441 }
442
443 textsize = 2 * len;
444 textspace = malloc(textsize);
445 if (!textspace)
446 memerr("gcide_markup_parse");
447 textpos = textstart = 0;
448
449 tagstk = dico_list_create();
450 if (!tagstk)
451 memerr("gcide_markup_parse");
452 dico_list_set_free_item(tagstk, free_tag, NULL);
453 current_tag = calloc(1, sizeof(*current_tag));
454 if (!current_tag)
455 memerr("gcide_markup_parse");
456
457 yy_flex_debug = dbg;
458 BEGIN(INITIAL);
459 while (yylex ())
460 ;
461
462 /* Append trailing text segment, if any */
463 if (text_segment_length()) {
464 struct gcide_tag *tag = calloc(1, sizeof(*tag));
465 if (!tag)
466 memerr("gcide lexer");
467 tag->tag_type = gcide_content_text;
468 tag->tag_v.textpos = text_segment_finish();
469 append_tag(tag);
470 }
471
472 while ((p = dico_list_pop(tagstk)))
473 /* FIXME: Report unclosed tag */
474 current_tag = p;
475
476 dico_list_destroy(&tagstk);
477
478 if (!current_tag) {
479 free(textspace);
480 return NULL;
481 }
482
483 early_fixup(current_tag, NULL);
484
485 tp = malloc(sizeof(*tp));
486 if (!tp)
487 memerr("gcide_markup_parse");
488
489 tp->textspace = textspace;
490 tp->textsize = textsize;
491
492 tp->root = current_tag;
493
494 tag_fixup(tp->root, textspace);
495
496 return tp;
497 }
498
499 void
gcide_parse_tree_free(struct gcide_parse_tree * tp)500 gcide_parse_tree_free(struct gcide_parse_tree *tp)
501 {
502 free(tp->textspace);
503 free_tag(tp->root, NULL);
504 free(tp);
505 }
506
507