1 /* xgettext Smalltalk backend.
2    Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <stdio.h>
26 #include <stdlib.h>
27 
28 #include "message.h"
29 #include "xgettext.h"
30 #include "x-smalltalk.h"
31 #include "error.h"
32 #include "xalloc.h"
33 #include "exit.h"
34 #include "gettext.h"
35 
/* Shorthand for marking and translating user-visible strings.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that an expression such as
   *ptr_to_array is handled correctly.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
39 
40 
41 /* The relevant parts of the Smalltalk syntax are:
42 
43      stringliteral ::= string | stringconst | symconst
44      stringconst ::= "#"string
45      string      ::= "'"[char]*"'"
46      symconst    ::= "#"symbol
47      symbol      ::= id | binsel | keysel[keysel]*
48      keysel      ::= id":"
49      id          ::= letter[letter|digit]*
50      letter      ::= "A".."Z" | "a".."z"
51      digit       ::= "0".."9"
52      binsel      ::= selchar[selchar]
53      selchar     ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
54                      | "=" | "&" | "@" | "?" | "%" | "\"
55 
56    Strings can contain any characters; to include the string delimiter itself,
57    it must be duplicated.
58 
59    Character constants are written  "$"char
60 
61    Comments are enclosed within double quotes.
62 
63    In well-formed expressions, {} and [] and () are balanced.
64  */
65 
66 
67 /* ======================== Reading of characters.  ======================== */
68 
69 
/* The stream currently being scanned.  */
static FILE *fp;

/* Name of the file actually being read; it appears in the message printed
   when reading fails.  */
static const char *real_file_name;

/* File name and current line number recorded as the position of each
   extracted message.  */
static char *logical_file_name;
static int line_number;
79 
80 
81 /* 1. line_number handling.  */
82 
83 static int
phase1_getc()84 phase1_getc ()
85 {
86   int c = getc (fp);
87 
88   if (c == EOF)
89     {
90       if (ferror (fp))
91 	error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
92 	       real_file_name);
93       return EOF;
94     }
95 
96   if (c == '\n')
97     line_number++;
98 
99   return c;
100 }
101 
102 /* Supports only one pushback character.  */
103 static void
phase1_ungetc(int c)104 phase1_ungetc (int c)
105 {
106   if (c != EOF)
107     {
108       if (c == '\n')
109 	--line_number;
110 
111       ungetc (c, fp);
112     }
113 }
114 
115 
116 /* Accumulating comments.  */
117 
/* Growable buffer for the comment line being collected: BUFFER holds
   BUFLEN characters and has room for BUFMAX bytes.  */
static char *buffer;
static size_t buflen;
static size_t bufmax;
121 
122 static inline void
comment_start()123 comment_start ()
124 {
125   buflen = 0;
126 }
127 
128 static inline void
comment_add(int c)129 comment_add (int c)
130 {
131   if (buflen >= bufmax)
132     {
133       bufmax = 2 * bufmax + 10;
134       buffer = xrealloc (buffer, bufmax);
135     }
136   buffer[buflen++] = c;
137 }
138 
139 static inline void
comment_line_end()140 comment_line_end ()
141 {
142   while (buflen >= 1
143 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
144     --buflen;
145   if (buflen >= bufmax)
146     {
147       bufmax = 2 * bufmax + 10;
148       buffer = xrealloc (buffer, bufmax);
149     }
150   buffer[buflen] = '\0';
151   savable_comment_add (buffer);
152 }
153 
154 
/* Line on which the most recent comment began, and line of the most recent
   non-comment token.  At each newline the two are compared to decide
   whether the saved comments are discarded.  */
static int last_comment_line;
static int last_non_comment_line;
159 
160 
161 /* ========================== Reading of tokens.  ========================== */
162 
163 
/* The kinds of token the lexer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_uniq,              /* # */
  token_type_symbol,            /* symbol */
  token_type_string_literal,    /* string, stringconst, symconst */
  token_type_other              /* misc. operator */
} token_type_ty;
173 
174 typedef struct token_ty token_ty;
175 struct token_ty
176 {
177   token_type_ty type;
178   char *string;		/* for token_type_string_literal, token_type_symbol */
179   int line_number;
180 };
181 
182 
183 /* 2. Combine characters into tokens.  Discard comments and whitespace.  */
184 
185 static token_ty phase2_pushback[1];
186 static int phase2_pushback_length;
187 
188 static void
phase2_get(token_ty * tp)189 phase2_get (token_ty *tp)
190 {
191   static char *buffer;
192   static int bufmax;
193   int bufpos;
194   int c;
195 
196   if (phase2_pushback_length)
197     {
198       *tp = phase2_pushback[--phase2_pushback_length];
199       return;
200     }
201 
202   tp->string = NULL;
203 
204   for (;;)
205     {
206       tp->line_number = line_number;
207       c = phase1_getc ();
208       switch (c)
209 	{
210 	case EOF:
211 	  tp->type = token_type_eof;
212 	  return;
213 
214 	case '"':
215 	  {
216 	    /* Comment.  */
217 	    int lineno;
218 
219 	    comment_start ();
220 	    lineno = line_number;
221 	    for (;;)
222 	      {
223 		c = phase1_getc ();
224 		if (c == '"' || c == EOF)
225 		  break;
226 		if (c == '\n')
227 		  {
228 		    comment_line_end ();
229 		    comment_start ();
230 		  }
231 		else
232 		  {
233 		    /* We skip all leading white space, but not EOLs.  */
234 		    if (!(buflen == 0 && (c == ' ' || c == '\t')))
235 		      comment_add (c);
236 		  }
237 	      }
238 	    comment_line_end ();
239 	    last_comment_line = lineno;
240 	    continue;
241 	  }
242 
243 	case '\n':
244 	  if (last_non_comment_line > last_comment_line)
245 	    savable_comment_reset ();
246 	  /* FALLTHROUGH */
247 	case ' ':
248 	case '\t':
249 	case '\r':
250 	  /* Ignore whitespace.  */
251 	  continue;
252 	}
253 
254       last_non_comment_line = tp->line_number;
255 
256       switch (c)
257 	{
258 	case '\'':
259 	  /* String literal.  */
260 	  bufpos = 0;
261 	  for (;;)
262 	    {
263 	      c = phase1_getc ();
264 	      if (c == EOF)
265 		break;
266 	      if (c == '\'')
267 		{
268 		  c = phase1_getc ();
269 		  if (c != '\'')
270 		    {
271 		      phase1_ungetc (c);
272 		      break;
273 		    }
274 		}
275 	      if (bufpos >= bufmax)
276 		{
277 		  bufmax = 2 * bufmax + 10;
278 		  buffer = xrealloc (buffer, bufmax);
279 		}
280 	      buffer[bufpos++] = c;
281 	    }
282 	  if (bufpos >= bufmax)
283 	    {
284 	      bufmax = 2 * bufmax + 10;
285 	      buffer = xrealloc (buffer, bufmax);
286 	    }
287 	  buffer[bufpos] = 0;
288 	  tp->type = token_type_string_literal;
289 	  tp->string = xstrdup (buffer);
290 	  return;
291 
292 	case '+':
293 	case '-':
294 	case '*':
295 	case '/':
296 	case '~':
297 	case '|':
298 	case ',':
299 	case '<':
300 	case '>':
301 	case '=':
302 	case '&':
303 	case '@':
304 	case '?':
305 	case '%':
306 	case '\\':
307 	  {
308 	    char *name;
309 	    int c2 = phase1_getc ();
310 	    switch (c2)
311 	      {
312 	      case '+':
313 	      case '-':
314 	      case '*':
315 	      case '/':
316 	      case '~':
317 	      case '|':
318 	      case ',':
319 	      case '<':
320 	      case '>':
321 	      case '=':
322 	      case '&':
323 	      case '@':
324 	      case '?':
325 	      case '%':
326 		name = xmalloc (3);
327 		name[0] = c;
328 		name[1] = c2;
329 		name[2] = '\0';
330 		tp->type = token_type_symbol;
331 		tp->string = name;
332 		return;
333 	      default:
334 		phase1_ungetc (c2);
335 		break;
336 	      }
337 	    name = xmalloc (2);
338 	    name[0] = c;
339 	    name[1] = '\0';
340 	    tp->type = token_type_symbol;
341 	    tp->string = name;
342 	    return;
343 	  }
344 
345 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
346 	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
347 	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
348 	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
349 	case 'Y': case 'Z':
350 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
351 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
352 	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
353 	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
354 	case 'y': case 'z':
355 	  /* Recognize id or id":"[id":"]* or id":"[id":"]*id.  */
356 	  bufpos = 0;
357 	  for (;;)
358 	    {
359 	      if (bufpos >= bufmax)
360 		{
361 		  bufmax = 2 * bufmax + 10;
362 		  buffer = xrealloc (buffer, bufmax);
363 		}
364 	      buffer[bufpos++] = c;
365 	      c = phase1_getc ();
366 	      switch (c)
367 		{
368 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
369 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
370 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
371 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
372 		case 'Y': case 'Z':
373 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
374 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
375 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
376 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
377 		case 'y': case 'z':
378 		case '0': case '1': case '2': case '3': case '4':
379 		case '5': case '6': case '7': case '8': case '9':
380 		  continue;
381 		case ':':
382 		  if (bufpos >= bufmax)
383 		    {
384 		      bufmax = 2 * bufmax + 10;
385 		      buffer = xrealloc (buffer, bufmax);
386 		    }
387 		  buffer[bufpos++] = c;
388 		  c = phase1_getc ();
389 		  switch (c)
390 		    {
391 		    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
392 		    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
393 		    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
394 		    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
395 		    case 'Y': case 'Z':
396 		    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
397 		    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
398 		    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
399 		    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
400 		    case 'y': case 'z':
401 		      continue;
402 		    default:
403 		      phase1_ungetc (c);
404 		      break;
405 		    }
406 		  break;
407 		default:
408 		  phase1_ungetc (c);
409 		  break;
410 		}
411 	      break;
412 	    }
413 	  if (bufpos >= bufmax)
414 	    {
415 	      bufmax = 2 * bufmax + 10;
416 	      buffer = xrealloc (buffer, bufmax);
417 	    }
418 	  buffer[bufpos] = '\0';
419 	  tp->string = xstrdup (buffer);
420 	  tp->type = token_type_symbol;
421 	  return;
422 
423 	case '#':
424 	  /* Uniquification operator.  */
425 	  tp->type = token_type_uniq;
426 	  return;
427 
428 	case '$':
429 	  c = phase1_getc ();
430 	  tp->type = token_type_other;
431 	  return;
432 
433 	default:
434 	  tp->type = token_type_other;
435 	  return;
436 	}
437     }
438 }
439 
440 /* Supports only one pushback token.  */
441 static void
phase2_unget(token_ty * tp)442 phase2_unget (token_ty *tp)
443 {
444   if (tp->type != token_type_eof)
445     {
446       if (phase2_pushback_length == SIZEOF (phase2_pushback))
447 	abort ();
448       phase2_pushback[phase2_pushback_length++] = *tp;
449     }
450 }
451 
452 
453 /* 3. Combine "# string_literal" and "# symbol" to a single token.  */
454 
455 static void
x_smalltalk_lex(token_ty * tp)456 x_smalltalk_lex (token_ty *tp)
457 {
458   phase2_get (tp);
459   if (tp->type == token_type_uniq)
460     {
461       token_ty token2;
462 
463       phase2_get (&token2);
464       if (token2.type == token_type_symbol
465 	  || token2.type == token_type_string_literal)
466 	{
467 	  tp->type = token_type_string_literal;
468 	  tp->string = token2.string;
469 	}
470       else
471 	phase2_unget (&token2);
472     }
473 }
474 
475 
476 /* ========================= Extracting strings.  ========================== */
477 
478 /* The file is broken into tokens.  Scan the token stream, looking for the
479    following patterns
480       NLS ? <string>
481       NLS at: <string>
482       NLS at: <string> plural: <string>
483    where <string> is one of
484       string_literal
485       # string_literal
486       # symbol
487  */
488 
489 void
extract_smalltalk(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)490 extract_smalltalk (FILE *f,
491 		   const char *real_filename, const char *logical_filename,
492 		   flag_context_list_table_ty *flag_table,
493 		   msgdomain_list_ty *mdlp)
494 {
495   message_list_ty *mlp = mdlp->item[0]->messages;
496 
497   fp = f;
498   real_file_name = real_filename;
499   logical_file_name = xstrdup (logical_filename);
500   line_number = 1;
501 
502   last_comment_line = -1;
503   last_non_comment_line = -1;
504 
505   /* Eat tokens until eof is seen.  */
506   {
507     /* 0 when no "NLS" has been seen.
508        1 after "NLS".
509        2 after "NLS ?".
510        3 after "NLS at:".
511        4 after "NLS at: <string>".
512        5 after "NLS at: <string> plural:".  */
513     int state;
514     /* Remember the message containing the msgid, for msgid_plural.
515        Non-NULL in states 4, 5.  */
516     message_ty *plural_mp = NULL;
517 
518     /* Start state is 0.  */
519     state = 0;
520 
521     for (;;)
522       {
523 	token_ty token;
524 
525 	x_smalltalk_lex (&token);
526 
527 	switch (token.type)
528 	  {
529 	  case token_type_symbol:
530 	    state = (strcmp (token.string, "NLS") == 0 ? 1 :
531 		     strcmp (token.string, "?") == 0 && state == 1 ? 2 :
532 		     strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
533 		     strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
534 		     0);
535 	    free (token.string);
536 	    break;
537 
538 	  case token_type_string_literal:
539 	    if (state == 2)
540 	      {
541 		lex_pos_ty pos;
542 		pos.file_name = logical_file_name;
543 		pos.line_number = token.line_number;
544 		remember_a_message (mlp, NULL, token.string, null_context,
545 				    &pos, savable_comment);
546 		state = 0;
547 		break;
548 	      }
549 	    if (state == 3)
550 	      {
551 		lex_pos_ty pos;
552 		pos.file_name = logical_file_name;
553 		pos.line_number = token.line_number;
554 		plural_mp = remember_a_message (mlp, NULL, token.string,
555 						null_context, &pos,
556 						savable_comment);
557 		state = 4;
558 		break;
559 	      }
560 	    if (state == 5)
561 	      {
562 		lex_pos_ty pos;
563 		pos.file_name = logical_file_name;
564 		pos.line_number = token.line_number;
565 		remember_a_message_plural (plural_mp, token.string,
566 					   null_context, &pos,
567 					   savable_comment);
568 		state = 0;
569 		break;
570 	      }
571 	    state = 0;
572 	    free (token.string);
573 	    break;
574 
575 	  case token_type_uniq:
576 	  case token_type_other:
577 	    state = 0;
578 	    break;
579 
580 	  case token_type_eof:
581 	    break;
582 
583 	  default:
584 	    abort ();
585 	  }
586 
587 	if (token.type == token_type_eof)
588 	  break;
589       }
590   }
591 
592   /* Close scanner.  */
593   fp = NULL;
594   real_file_name = NULL;
595   logical_file_name = NULL;
596   line_number = 0;
597 }
598