1#! /usr/bin/perl -w
2# -*- Perl -*-
3#
4# afblue.pl
5#
6# Process a blue zone character data file.
7#
8# Copyright (C) 2013-2019 by
9# David Turner, Robert Wilhelm, and Werner Lemberg.
10#
11# This file is part of the FreeType project, and may only be used,
12# modified, and distributed under the terms of the FreeType project
13# license, LICENSE.TXT.  By continuing to use, modify, or distribute
14# this file you indicate that you have read the license and
15# understand and accept it fully.
16
17use strict;
18use warnings;
19use English '-no_match_vars';
20use open ':std', ':encoding(UTF-8)';
21
22
23my $prog = $PROGRAM_NAME;
24$prog =~ s| .* / ||x;      # Remove path.
25
26die "usage: $prog datafile < infile > outfile\n" if $#ARGV != 0;
27
28
29my $datafile = $ARGV[0];
30
31my %diversions;        # The extracted and massaged data from `datafile'.
32my @else_stack;        # Booleans to track else-clauses.
33my @name_stack;        # Stack of integers used for names of aux. variables.
34
35my $curr_enum;         # Name of the current enumeration.
36my $curr_array;        # Name of the current array.
37my $curr_max;          # Name of the current maximum value.
38
39my $curr_enum_element; # Name of the current enumeration element.
40my $curr_offset;       # The offset relative to current aux. variable.
41my $curr_elem_size;    # The number of non-space characters in the current string or
42                       # the number of elements in the current block.
43
44my $have_sections = 0; # Boolean; set if start of a section has been seen.
45my $have_strings;      # Boolean; set if current section contains strings.
46my $have_blocks;       # Boolean; set if current section contains blocks.
47
48my $have_enum_element; # Boolean; set if we have an enumeration element.
49my $in_string;         # Boolean; set if a string has been parsed.
50
51my $num_sections = 0;  # Number of sections seen so far.
52
53my $last_aux;          # Name of last auxiliary variable.
54
55
56# Regular expressions.
57
58# [<ws>] <enum_name> <ws> <array_name> <ws> <max_name> [<ws>] ':' [<ws>] '\n'
59my $section_re = qr/ ^ \s* (\S+) \s+ (\S+) \s+ (\S+) \s* : \s* $ /x;
60
61# [<ws>] <enum_element_name> [<ws>] '\n'
62my $enum_element_re = qr/ ^ \s* ( [A-Za-z0-9_]+ ) \s* $ /x;
63
64# '#' <preprocessor directive> '\n'
65my $preprocessor_re = qr/ ^ \# /x;
66
67# [<ws>] '/' '/' <comment> '\n'
68my $comment_re = qr| ^ \s* // |x;
69
70# empty line
71my $whitespace_only_re = qr/ ^ \s* $ /x;
72
73# [<ws>] '"' <string> '"' [<ws>] '\n'  (<string> doesn't contain newlines)
74my $string_re = qr/ ^ \s*
75                       " ( (?> (?: (?> [^"\\]+ ) | \\. )* ) ) "
76                       \s* $ /x;
77
78# [<ws>] '{' <block> '}' [<ws>] '\n'  (<block> can contain newlines)
79my $block_start_re = qr/ ^ \s* \{ /x;
80
81# We need the capturing group for `split' to make it return the separator
82# tokens (i.e., the opening and closing brace) also.
83my $brace_re = qr/ ( [{}] ) /x;
84
85
86sub Warn
87{
88  my $message = shift;
89  warn "$datafile:$INPUT_LINE_NUMBER: warning: $message\n";
90}
91
92
93sub Die
94{
95  my $message = shift;
96  die "$datafile:$INPUT_LINE_NUMBER: error: $message\n";
97}
98
99
100my $warned_before = 0;
101
102sub warn_before
103{
104  Warn("data before first section gets ignored") unless $warned_before;
105  $warned_before = 1;
106}
107
108
109sub strip_newline
110{
111  chomp;
112  s/ \x0D $ //x;
113}
114
115
116sub end_curr_string
117{
118  # Append final null byte to string.
119  if ($have_strings)
120  {
121    push @{$diversions{$curr_array}}, "    '\\0',\n" if $in_string;
122
123    $curr_offset++;
124    $in_string = 0;
125  }
126}
127
128
129sub update_max_elem_size
130{
131  if ($curr_elem_size)
132  {
133    my $max = pop @{$diversions{$curr_max}};
134    $max = $curr_elem_size if $curr_elem_size > $max;
135    push @{$diversions{$curr_max}}, $max;
136  }
137}
138
139
140sub convert_non_ascii_char
141{
142  # A UTF-8 character outside of the printable ASCII range, with possibly a
143  # leading backslash character.
144  my $s = shift;
145
146  # Here we count characters, not bytes.
147  $curr_elem_size += length $s;
148
149  utf8::encode($s);
150  $s = uc unpack 'H*', $s;
151
152  $curr_offset += $s =~ s/\G(..)/'\\x$1', /sg;
153
154  return $s;
155}
156
157
158sub convert_ascii_chars
159{
160  # A series of ASCII characters in the printable range.
161  my $s = shift;
162
163  # We reduce multiple space characters to a single one.
164  $s =~ s/ +/ /g;
165
166  # Count all non-space characters.  Note that `()' applies a list context
167  # to the capture that is used to count the elements.
168  $curr_elem_size += () = $s =~ /[^ ]/g;
169
170  $curr_offset += $s =~ s/\G(.)/'$1', /g;
171
172  return $s;
173}
174
175
176sub convert_literal
177{
178  my $s = shift;
179  my $orig = $s;
180
181  # ASCII printables and space
182  my $safe_re = '\x20-\x7E';
183  # ASCII printables and space, no backslash
184  my $safe_no_backslash_re = '\x20-\x5B\x5D-\x7E';
185
186  $s =~ s{
187           (?: \\? ( [^$safe_re] )
188               | ( (?: [$safe_no_backslash_re]
189                       | \\ [$safe_re] )+ ) )
190         }
191         {
192           defined($1) ? convert_non_ascii_char($1)
193                       : convert_ascii_chars($2)
194         }egx;
195
196   # We assume that `$orig' doesn't contain `*/'
197   return $s . " /* $orig */";
198}
199
200
201sub aux_name
202{
203  return "af_blue_" . $num_sections. "_" . join('_', @name_stack);
204}
205
206
207sub aux_name_next
208{
209  $name_stack[$#name_stack]++;
210  my $name = aux_name();
211  $name_stack[$#name_stack]--;
212
213  return $name;
214}
215
216
217sub enum_val_string
218{
219  # Build string that holds code to save the current offset in an
220  # enumeration element.
221  my $aux = shift;
222
223  my $add = ($last_aux eq "af_blue_" . $num_sections . "_0" )
224              ? ""
225              : "$last_aux + ";
226
227  return "    $aux = $add$curr_offset,\n";
228}
229
230
231
232# Process data file.
233
234open(DATA, $datafile) || die "$prog: can't open \`$datafile': $OS_ERROR\n";
235
236while (<DATA>)
237{
238  strip_newline();
239
240  next if /$comment_re/;
241  next if /$whitespace_only_re/;
242
243  if (/$section_re/)
244  {
245    Warn("previous section is empty") if ($have_sections
246                                          && !$have_strings
247                                          && !$have_blocks);
248
249    end_curr_string();
250    update_max_elem_size();
251
252    # Save captured groups from `section_re'.
253    $curr_enum = $1;
254    $curr_array = $2;
255    $curr_max = $3;
256
257    $curr_enum_element = "";
258    $curr_offset = 0;
259
260    Warn("overwriting already defined enumeration \`$curr_enum'")
261      if exists($diversions{$curr_enum});
262    Warn("overwriting already defined array \`$curr_array'")
263      if exists($diversions{$curr_array});
264    Warn("overwriting already defined maximum value \`$curr_max'")
265      if exists($diversions{$curr_max});
266
267    $diversions{$curr_enum} = [];
268    $diversions{$curr_array} = [];
269    $diversions{$curr_max} = [];
270
271    push @{$diversions{$curr_max}}, 0;
272
273    @name_stack = ();
274    push @name_stack, 0;
275
276    $have_sections = 1;
277    $have_strings = 0;
278    $have_blocks = 0;
279
280    $have_enum_element = 0;
281    $in_string = 0;
282
283    $num_sections++;
284    $curr_elem_size = 0;
285
286    $last_aux = aux_name();
287
288    next;
289  }
290
291  if (/$preprocessor_re/)
292  {
293    if ($have_sections)
294    {
295      # Having preprocessor conditionals complicates the computation of
296      # correct offset values.  We have to introduce auxiliary enumeration
297      # elements with the name `af_blue_<s>_<n1>_<n2>_...' that store
298      # offsets to be used in conditional clauses.  `<s>' is the number of
299      # sections seen so far, `<n1>' is the number of `#if' and `#endif'
300      # conditionals seen so far in the topmost level, `<n2>' the number of
301      # `#if' and `#endif' conditionals seen so far one level deeper, etc.
302      # As a consequence, uneven values are used within a clause, and even
303      # values after a clause, since the C standard doesn't allow the
304      # redefinition of an enumeration value.  For example, the name
305      # `af_blue_5_1_6' is used to construct enumeration values in the fifth
306      # section after the third (second-level) if-clause within the first
307      # (top-level) if-clause.  After the first top-level clause has
308      # finished, `af_blue_5_2' is used.  The current offset is then
309      # relative to the value stored in the current auxiliary element.
310
311      if (/ ^ \# \s* if /x)
312      {
313        push @else_stack, 0;
314
315        $name_stack[$#name_stack]++;
316
317        push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
318        $last_aux = aux_name();
319
320        push @name_stack, 0;
321
322        $curr_offset = 0;
323      }
324      elsif (/ ^ \# \s* elif /x)
325      {
326        Die("unbalanced #elif") unless @else_stack;
327
328        pop @name_stack;
329
330        push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
331        $last_aux = aux_name();
332
333        push @name_stack, 0;
334
335        $curr_offset = 0;
336      }
337      elsif (/ ^ \# \s* else /x)
338      {
339        my $prev_else = pop @else_stack;
340        Die("unbalanced #else") unless defined($prev_else);
341        Die("#else already seen") if $prev_else;
342        push @else_stack, 1;
343
344        pop @name_stack;
345
346        push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next());
347        $last_aux = aux_name();
348
349        push @name_stack, 0;
350
351        $curr_offset = 0;
352      }
353      elsif (/ ^ (\# \s*) endif /x)
354      {
355        my $prev_else = pop @else_stack;
356        Die("unbalanced #endif") unless defined($prev_else);
357
358        pop @name_stack;
359
360        # If there is no else-clause for an if-clause, we add one.  This is
361        # necessary to have correct offsets.
362        if (!$prev_else)
363        {
364          # Use amount of whitespace from `endif'.
365          push @{$diversions{$curr_enum}}, enum_val_string(aux_name_next())
366                                           . $1 . "else\n";
367          $last_aux = aux_name();
368
369          $curr_offset = 0;
370        }
371
372        $name_stack[$#name_stack]++;
373
374        push @{$diversions{$curr_enum}}, enum_val_string(aux_name());
375        $last_aux = aux_name();
376
377        $curr_offset = 0;
378      }
379
380      # Handle (probably continued) preprocessor lines.
381    CONTINUED_LOOP:
382      {
383        do
384        {
385          strip_newline();
386
387          push @{$diversions{$curr_enum}}, $ARG . "\n";
388          push @{$diversions{$curr_array}}, $ARG . "\n";
389
390          last CONTINUED_LOOP unless / \\ $ /x;
391
392        } while (<DATA>);
393      }
394    }
395    else
396    {
397      warn_before();
398    }
399
400    next;
401  }
402
403  if (/$enum_element_re/)
404  {
405    end_curr_string();
406    update_max_elem_size();
407
408    $curr_enum_element = $1;
409    $have_enum_element = 1;
410    $curr_elem_size = 0;
411
412    next;
413  }
414
415  if (/$string_re/)
416  {
417    if ($have_sections)
418    {
419      Die("strings and blocks can't be mixed in a section") if $have_blocks;
420
421      # Save captured group from `string_re'.
422      my $string = $1;
423
424      if ($have_enum_element)
425      {
426        push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
427        $have_enum_element = 0;
428      }
429
430      $string = convert_literal($string);
431
432      push @{$diversions{$curr_array}}, "    $string\n";
433
434      $have_strings = 1;
435      $in_string = 1;
436    }
437    else
438    {
439      warn_before();
440    }
441
442    next;
443  }
444
445  if (/$block_start_re/)
446  {
447    if ($have_sections)
448    {
449      Die("strings and blocks can't be mixed in a section") if $have_strings;
450
451      my $depth = 0;
452      my $block = "";
453      my $block_end = 0;
454
455      # Count braces while getting the block.
456    BRACE_LOOP:
457      {
458        do
459        {
460          strip_newline();
461
462          foreach my $substring (split(/$brace_re/))
463          {
464            if ($block_end)
465            {
466              Die("invalid data after last matching closing brace")
467                if $substring !~ /$whitespace_only_re/;
468            }
469
470            $block .= $substring;
471
472            if ($substring eq '{')
473            {
474              $depth++;
475            }
476            elsif ($substring eq '}')
477            {
478              $depth--;
479
480              $block_end = 1 if $depth == 0;
481            }
482          }
483
484          # If we are here, we have run out of substrings, so get next line
485          # or exit.
486          last BRACE_LOOP if $block_end;
487
488          $block .= "\n";
489
490        } while (<DATA>);
491      }
492
493      if ($have_enum_element)
494      {
495        push @{$diversions{$curr_enum}}, enum_val_string($curr_enum_element);
496        $have_enum_element = 0;
497      }
498
499      push @{$diversions{$curr_array}}, $block . ",\n";
500
501      $curr_offset++;
502      $curr_elem_size++;
503
504      $have_blocks = 1;
505    }
506    else
507    {
508      warn_before();
509    }
510
511    next;
512  }
513
514  # Garbage.  We weren't able to parse the data.
515  Die("syntax error");
516}
517
518# Finalize data.
519end_curr_string();
520update_max_elem_size();
521
522
523# Filter stdin to stdout, replacing `@...@' templates.
524
525sub emit_diversion
526{
527  my $diversion_name = shift;
528  return (exists($diversions{$1})) ? "@{$diversions{$1}}"
529                                   : "@" . $diversion_name . "@";
530}
531
532
533$LIST_SEPARATOR = '';
534
535my $s1 = "This file has been generated by the Perl script \`$prog',";
536my $s1len = length $s1;
537my $s2 = "using data from file \`$datafile'.";
538my $s2len = length $s2;
539my $slen = ($s1len > $s2len) ? $s1len : $s2len;
540
541print "/* " . $s1 . " " x ($slen - $s1len) . " */\n"
542      . "/* " . $s2 . " " x ($slen - $s2len) . " */\n"
543      . "\n";
544
545while (<STDIN>)
546{
547  s/ @ ( [A-Za-z0-9_]+? ) @ / emit_diversion($1) /egx;
548  print;
549}
550
551# EOF
552