1#!/usr/bin/perl
2# Copyright (C) 2016 and later: Unicode, Inc. and others.
3# License & terms of use: http://www.unicode.org/copyright.html
4#  ********************************************************************
5#  * COPYRIGHT:
6#  * Copyright (c) 2002-2016, International Business Machines Corporation and
7#  * others. All Rights Reserved.
8#  ********************************************************************
9#
10#  regexcst.pl
#            Compile the regular expression parser state table data into initialized C data.
12#            Usage:
13#                   cd icu/source/i18n
14#                   perl regexcst.pl < regexcst.txt > regexcst.h
15#
16#             The output file, regexcst.h, is included by some of the .cpp regex
17#             implementation files.   This perl script is NOT run as part
18#             of a normal ICU build.  It is run by hand when needed, and the
19#             regexcst.h generated file is put back into cvs.
20#
21#             See regexcst.txt for a description of the input format for this script.
22#
#             This script is derived from rbbicst.pl, which performs the same function
24#             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
25#             merged?
26#
27
28
use strict;
use warnings;

#
# Overview
#   Pass 1 reads the state table description (STDIN, or the files named on the
#          command line) and records one entry in @rows per transition line.
#   Pass 2 collects the character class and action names, and checks that every
#          referenced state has been defined.
#   Pass 3 writes the generated C header to STDOUT.
#
#   All diagnostics go to STDERR, so that they are visible on the terminal and
#   never end up inside the generated header when STDOUT is redirected.
#

my $num_states = 1;         # Always the state number for the line being compiled.
                            #   Row 0 of the table is a dummy; real rows start at 1.
my $line_num   = 0;         # The line number in the input file.
my $errors     = 0;         # Count of undefined-state errors found in pass 2.

my %states = (pop => 255);  # State name -> state number.
                            # Add "pop" to the list of defined state names.
                            # This makes any state labelled "pop" get reported as a duplicate,
                            #  and resolves references to "pop" in the next state field.

my @state_names;            # $state_names[N] is the label attached to table row N, if any.
my @rows;                   # $rows[N] is a hash ref describing table row N, with keys
                            #   line, literal, char_class, flag, dest, push, func.

LINE: while (my $line = <>) {
    chomp $line;
    $line_num++;
    my @fields = split ' ', $line;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for my $i (0 .. $#fields) {
        if ($fields[$i] =~ /^#/) {
            splice(@fields, $i);
            last;
        }
    }

    # ignore blank lines, and those with no fields left after stripping comments.
    next LINE if !@fields;

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /^(.*):$/) {
        my $state_name = $1;         # the label with its trailing colon stripped off.

        if ($states{$state_name}) {
            print STDERR "  regexcst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name}      = $num_states;
        $state_names[$num_states] = $state_name;

        # if the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        shift @fields;
        next LINE if !@fields;
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                       or a name of a character class, e.g. white_space
    #
    my %row = (
        line       => $line_num,   # remember line number with each state
                                   #  so we can make better error messages later.
        literal    => undef,       # the quoted literal character, when there is one.
        char_class => '',          # otherwise, the character class name.
        flag       => 'FALSE',     # "TRUE" when the 'n' flag is present.
        dest       => '',          # name of the destination state.
        push       => '',          # name of the state to push, '' when absent.
        func       => 'doNOP',     # action name; doNOP when none is given.
    );

    #
    # First field, character class or literal character for this transition.
    #
    my $char_field = shift @fields;
    if ($char_field =~ /^'(.)'$/) {
        # We've got a quoted literal character.  Capturing it (rather than deleting
        #   every quote from the field) keeps a quoted single quote intact.
        $row{literal} = $1;
    } elsif ($char_field =~ /\W/) {
        print STDERR "  regexcst:  at line $line_num, bad character literal or character class name.\n";
        print STDERR "     scanning $char_field\n";
        exit(-1);
    } else {
        # We've got the name of a character class.
        $row{char_class} = $char_field;
    }

    #
    # do the 'n' flag
    #
    if (@fields && $fields[0] eq 'n') {
        $row{flag} = 'TRUE';
        shift @fields;
    }

    #
    # do the destination state.
    #
    if (!@fields) {
        print STDERR "  regexcst:  at line $line_num, destination state missing.\n";
        exit(-1);
    }
    $row{dest} = shift @fields;

    #
    # do the push state, if present.
    #
    if (@fields && $fields[0] =~ /^\^(.*)$/) {
        my $push_state = $1;
        if ($push_state eq '') {
            print STDERR "  regexcst:  at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        $row{push} = $push_state;
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if (@fields) {
        $row{func} = shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #  This is reported but is not fatal; the row is still added to the table.
    #
    if (@fields) {
        print STDERR "  regexcst:  at line $line_num, unexpected extra stuff on input line.\n";
        print STDERR "     scanning $fields[0]\n";
    }

    $rows[$num_states] = \%row;
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#   C source code for the state transition table.
#
# We read all states first, before writing anything,  so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
#
my %charClasses;                  # class name -> 1 here; replaced by the table index below.
my %actions = (doNOP => 1);       # doNOP is always needed: the dummy row 0 refers to it.
for my $state (1 .. $num_states - 1) {
    my $row = $rows[$state];
    if ($row->{char_class} ne '') {
        $charClasses{ $row->{char_class} } = 1;
    }
    $actions{ $row->{func} } = 1;
}

#
# Check that all of the destination states have been defined
#
$states{exit} = 0;                # Predefined state name, terminates state machine.
for my $state (1 .. $num_states - 1) {
    my $row        = $rows[$state];
    my $dest_state = $row->{dest};
    my $push_state = $row->{push};

    if (!$states{$dest_state} && $dest_state ne 'exit') {
        print STDERR "Error at line $row->{line}: target state \"$dest_state\" is not defined.\n";
        $errors++;
    }
    # "exit" (state number 0) is deliberately not accepted as a push state:
    #   a zero in the push-state column means "no push state".
    if ($push_state ne '' && !$states{$push_state}) {
        print STDERR "Error at line $row->{line}: target state \"$push_state\" is not defined.\n";
        $errors++;
    }
}

die "regexcst: $errors undefined state reference(s), no output generated.\n" if $errors > 0;

# NOTE(review): the include guard name RBBIRPT_H and the enum terminator rbbiLastAction
#   look inherited from rbbicst.pl.  They are left unchanged here so that the generated
#   header stays the same; confirm before renaming.
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File.  Do not edit by hand.\n";
print "//    This file contains the state table for the ICU Regular Expression Pattern Parser\n";
print "//    It is generated by the Perl script \"regexcst.pl\" from\n";
print "//    the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
print "//   Copyright (C) 2002-2016 International Business Machines Corporation \n";
print "//   and others. All rights reserved.  \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
print "#ifndef RBBIRPT_H\n";
print "#define RBBIRPT_H\n";
print "\n";
print "#include \"unicode/utypes.h\"\n";
print "\n";
print "U_NAMESPACE_BEGIN\n";

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into hash.
#
#   The names are visited in sorted order so that the generated header is the same
#   on every run; the order returned by "keys" alone is not stable.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";

# Sets "default", "quoted", etc. get special handling.
#  They have no corresponding UnicodeSet object in the state machine,
#    but are handled by special case code.  So we emit no reference
#    to a UnicodeSet object to them here.
my %special_class_index = (default => 255, quoted => 254, eof => 253);

my $next_set_index = 128;   # State Table values for Unicode char sets range from 128-250.
for my $set_name (sort keys %charClasses) {
    if (exists $special_class_index{$set_name}) {
        $charClasses{$set_name} = $special_class_index{$set_name};
        next;
    }
    # Normal character class.  Emit a constant giving its index.
    print "    static const uint8_t kRuleSet_$set_name = $next_set_index;\n";
    $charClasses{$set_name} = $next_set_index;
    $next_set_index++;
}
print "\n\n";

#
# Emit the enum for the actions to be performed (sorted, for reproducible output).
#
print "enum Regex_PatternParseAction {\n";
for my $action (sort keys %actions) {
    print "    $action,\n";
}
print "    rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#
print "//-------------------------------------------------------------------------------\n";
print "//\n";
print "//  RegexTableEl       represents the structure of a row in the transition table\n";
print "//                     for the pattern parser state machine.\n";
print "//-------------------------------------------------------------------------------\n";
print "struct RegexTableEl {\n";
print "    Regex_PatternParseAction      fAction;\n";
print "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n";
print "                                                    // 128-255:  character class index\n";
print "    uint8_t                       fNextState;       // 0-250:    normal next-state numbers\n";
print "                                                    // 255:      pop next-state from stack.\n";
print "    uint8_t                       fPushState;\n";
print "    UBool                         fNextChar;\n";
print "};\n\n";

#
# emit the state transition table
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for my $state (1 .. $num_states - 1) {
    my $row = $rows[$state];

    print "    , {$row->{func},";
    if (defined $row->{literal}) {
        #  use numeric value, so EBCDIC machines are ok.
        #  The character is passed as a printf argument, not spliced into the format,
        #    so that a literal '%' cannot be misread as a conversion.
        printf(" %d /* %s */,", ord($row->{literal}), $row->{literal});
    } else {
        print " $charClasses{ $row->{char_class} },";
    }
    print " $states{ $row->{dest} },";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    if ($row->{push} eq '') {
        print "0, ";
    } else {
        print " $states{ $row->{push} },";
    }
    print " $row->{flag}} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print "    //  $state ";
    my $name = $state_names[$state];
    if (defined $name && $name ne '') {
        print "     $name";
    }
    print "\n";
}
print " };\n";


#
# emit a mapping array from state numbers to state names.
#
#    This array is used for producing debugging output from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
for my $state (0 .. $num_states - 1) {
    my $name = $state_names[$state];
    if (defined $name && $name ne '') {
        print "     \"$name\",\n";
    } else {
        print "    0,\n";
    }
}
print "    0};\n\n";

print "U_NAMESPACE_END\n";
print "#endif\n";
330
331
332
333