#!/usr/bin/perl
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#  ********************************************************************
#  * COPYRIGHT:
#  * Copyright (c) 2002-2016, International Business Machines Corporation and
#  * others. All Rights Reserved.
#  ********************************************************************
#
#  regexcst.pl
#            Compile the regular expression parser state table data into initialized C data.
#            Usage:
#                   cd icu4c/source/i18n
#                   perl regexcst.pl < regexcst.txt > regexcst.h
#
#             The output file, regexcst.h, is included by some of the .cpp regex
#             implementation files.   This perl script is NOT run as part
#             of a normal ICU build.  It is run by hand when needed, and the
#             regexcst.h generated file is put back into the source code repository.
#
#             See regexcst.txt for a description of the input format for this script.
#
#             This script is derived from rbbicst.pl, which performs the same function
#             for the Rule Based Break Iterator Rule Parser.  Perhaps they could be
#             merged?
#


# Bookkeeping shared by the rest of the script.
$line_num   = 0;         # Count of input lines read so far; quoted in diagnostics.
$num_states = 1;         # Always the state number for the line being compiled.

# Register "pop" as a state name up front.  That keeps any real state from
#  being labelled "pop", and lets "pop" in a next-state field resolve to 255.
%states = (pop => 255);

#
# Pass 1: read the state table description, one line at a time, from the files
#   named on the command line or from standard input.  Every transition row is
#   recorded in the parallel @state_* arrays, indexed by row number.  State
#   labels are recorded in %states (name -> row number) and in @stateNames
#   (row number -> name).  Nothing is emitted here except diagnostics, which
#   are printed to standard output.
#
line_loop: while (<>) {
    chomp();
    @fields = split();
    $line_num++;

    # Remove # comments, which are any fields beginning with a #, plus all
    #  that follow on the line.
    for ($i=0; $i<@fields; $i++) {
        if ($fields[$i] =~ /^#/) {
            @fields = @fields[0 .. $i-1];
            last;
        }
    }
    # Ignore blank lines, and those with no fields left after stripping comments.
    if (@fields == 0) {
        next;
    }

    #
    # State Label:  handling.
    #    Does the first token end with a ":"?  If so, it's the name of a state.
    #    Put in a hash, together with the current state number,
    #        so that we can later look up the number from the name.
    #
    if ($fields[0] =~ /:$/) {
        $state_name = $fields[0];
        $state_name =~ s/:$//;       # strip off the trailing colon from the state name.

        # A non-zero entry means the name is already taken, either by an
        #  earlier label or by the predefined "pop".  The duplicate is only
        #  reported; the new definition then replaces the old one.
        if ($states{$state_name} != 0) {
            print "  rbbicst: at line $line_num duplicate definition of state $state_name\n";
        }
        $states{$state_name} = $num_states;
        $stateNames[$num_states] = $state_name;

        # If the label was the only thing on this line, go on to the next line,
        # otherwise assume that a state definition is on the same line and fall through.
        if (@fields == 1) {
            next line_loop;
        }
        shift @fields;                       # shift off label field in preparation
                                             #  for handling the rest of the line.
    }

    #
    # State Transition line.
    #   syntax is this,
    #       character   [n]  target-state  [^push-state]  [function-name]
    #   where
    #      [something]   is an optional something
    #      character     is either a single quoted character e.g. '['
    #                       or a name of a character class, e.g. white_space
    #

    $state_line_num[$num_states] = $line_num;   # remember line number with each state
                                                #  so we can make better error messages later.
    #
    # First field, character class or literal character for this transition.
    #
    if ($fields[0] =~ /^'.'$/) {
        # We've got a quoted literal character.  Store it without the quotes.
        $state_literal_chars[$num_states] = $fields[0];
        $state_literal_chars[$num_states] =~ s/'//g;
    } else {
        # We've got the name of a character class; it may contain only word characters.
        $state_char_class[$num_states] = $fields[0];
        if ($fields[0] =~ /[\W]/) {
            print "  rbbicsts:  at line $line_num, bad character literal or character class name.\n";
            print "     scanning $fields[0]\n";
            exit(-1);
        }
    }
    shift @fields;

    #
    # Do the 'n' flag.  The flag is stored as the text TRUE or FALSE.
    #
    $state_flag[$num_states] = "FALSE";
    if ($fields[0] eq "n") {
        $state_flag[$num_states] = "TRUE";
        shift @fields;
    }

    #
    # Do the destination state.  It is mandatory.
    #
    $state_dest_state[$num_states] = $fields[0];
    if ($fields[0] eq "") {
        print "  rbbicsts:  at line $line_num, destination state missing.\n";
        exit(-1);
    }
    shift @fields;

    #
    # Do the push state, if present.  It is written as ^name, with no space
    #  between the ^ and the state name.
    #
    if ($fields[0] =~ /^\^/) {
        $fields[0] =~ s/^\^//;
        $state_push_state[$num_states] = $fields[0];
        if ($fields[0] eq "" ) {
            print "  rbbicsts:  at line $line_num, expected state after ^ (no spaces).\n";
            exit(-1);
        }
        shift @fields;
    }

    #
    # Lastly, do the optional action name.
    #
    if ($fields[0] ne "") {
        $state_func_name[$num_states] = $fields[0];
        shift @fields;
    }

    #
    #  There should be no fields left on the line at this point.
    #  Leftovers are reported, but the row is still accepted.
    #
    if (@fields > 0) {
        print "  rbbicsts:  at line $line_num, unexpected extra stuff on input line.\n";
        print "     scanning $fields[0]\n";
    }
    $num_states++;
}

#
# We've read in the whole file, now go back and output the
#   C source code for the state transition table.
#
# We read all states first, before writing anything,  so that the state numbers
# for the destination states are all available to be written.
#

#
# Make hashes for the names of the character classes and
#      for the names of the actions that appeared.
#   The value 1 stored here only marks a name as seen.
#   Rows with no action name are given the default action "doNOP".
#
for ($state=1; $state < $num_states; $state++) {
    if ($state_char_class[$state] ne "") {
        $charClasses{$state_char_class[$state]} = 1;
    }
    if ($state_func_name[$state] eq "") {
        $state_func_name[$state] = "doNOP";
    }
    # Key on the row's action name (held in @state_func_name).
    $actions{$state_func_name[$state]} = 1;
}

#
# Verify that every state referred to by a transition row has a definition.
#   A row's destination must be a labelled state, "pop", or "exit"; a row's
#   push state, when it has one, must be a labelled state.  Each problem is
#   reported with the input line of the offending row, and the script dies
#   after the scan if anything was reported.
#
$states{"exit"} = 0;              # Predefined state name, terminates state machine.
$state = 1;
while ($state < $num_states) {
    my $where = $state_line_num[$state];
    my $dest  = $state_dest_state[$state];
    my $push  = $state_push_state[$state];

    # "exit" maps to 0, the same value an unknown name yields, so it has to
    #  be let through by name.
    unless ($dest eq "exit" || $states{$dest} != 0) {
        print "Error at line ${where}: target state \"${dest}\" is not defined.\n";
        $errors++;
    }
    if ($push ne "" && $states{$push} == 0) {
        print "Error at line ${where}: target state \"${push}\" is not defined.\n";
        $errors++;
    }
    $state++;
}

die if $errors > 0;

#
# Emit the fixed opening of the generated header: the copyright notice, the
#   "generated file" banner, the include guard, the utypes.h include and the
#   opening of the ICU namespace.
#
print(
    "// © 2016 and later: Unicode, Inc. and others.\n",
    "// License & terms of use: http://www.unicode.org/copyright.html\n",
    "//---------------------------------------------------------------------------------\n",
    "//\n",
    "// Generated Header File.  Do not edit by hand.\n",
    "//    This file contains the state table for the ICU Regular Expression Pattern Parser\n",
    "//    It is generated by the Perl script \"regexcst.pl\" from\n",
    "//    the rule parser state definitions file \"regexcst.txt\".\n",
    "//\n",
    "//   Copyright (C) 2002-2016 International Business Machines Corporation \n",
    "//   and others. All rights reserved.  \n",
    "//\n",
    "//---------------------------------------------------------------------------------\n",
    "#ifndef RBBIRPT_H\n",
    "#define RBBIRPT_H\n",
    "\n",
    "#include \"unicode/utypes.h\"\n",
    "\n",
    "U_NAMESPACE_BEGIN\n",
);

#
# Emit the constants for indices of Unicode Sets
#   Define one constant for each of the character classes encountered.
#   At the same time, store the index corresponding to the set name back into hash.
#
#   The names are visited in sorted order so that the index given to each set,
#   and the order of the emitted constants, is the same on every run.  Perl
#   does not promise any particular order for "keys" on its own.
#
print "//\n";
print "// Character classes for regex pattern scanning.\n";
print "//\n";
$i = 128;                   # State Table values for Unicode char sets range from 128-250.
                            # Sets "default", "quoted", etc. get special handling.
                            #  They have no corresponding UnicodeSet object in the state machine,
                            #    but are handled by special case code.  So we emit no reference
                            #    to a UnicodeSet object to them here.
foreach $setName (sort keys %charClasses) {
    if ($setName eq "default") {
        $charClasses{$setName} = 255;
    }
    elsif ($setName eq "quoted") {
        $charClasses{$setName} = 254;
    }
    elsif ($setName eq "eof") {
        $charClasses{$setName} = 253;
    }
    else {
        # Normal character class.  Emit a constant for it and take the next free index.
        print "    static const uint8_t kRuleSet_$setName = $i;\n";
        $charClasses{$setName} = $i;
        $i++;
    }
}
# The count is emitted as the C expression "<next index>-128".
print "    constexpr uint32_t kRuleSet_count = $i-128;";
print "\n\n";

#
# Emit the enum for the actions to be performed.
#   One enumerator is written per distinct action name, followed by the
#   closing sentinel rbbiLastAction.  The names are sorted so that the
#   enumerator order (and so each enumerator's value) is the same on every
#   run; Perl does not promise any particular order for "keys" on its own.
#
print "enum Regex_PatternParseAction {\n";
foreach $act (sort keys %actions) {
    print "    $act,\n";
}
print "    rbbiLastAction};\n\n";

#
# Emit the struct definition for transition table elements.
#   The text is fixed; nothing from the input file is interpolated into it.
#
print(
    "//-------------------------------------------------------------------------------\n",
    "//\n",
    "//  RegexTableEl       represents the structure of a row in the transition table\n",
    "//                     for the pattern parser state machine.\n",
    "//-------------------------------------------------------------------------------\n",
    "struct RegexTableEl {\n",
    "    Regex_PatternParseAction      fAction;\n",
    "    uint8_t                       fCharClass;       // 0-127:    an individual ASCII character\n",
    "                                                    // 128-255:  character class index\n",
    "    uint8_t                       fNextState;       // 0-250:    normal next-state numbers\n",
    "                                                    // 255:      pop next-state from stack.\n",
    "    uint8_t                       fPushState;\n",
    "    UBool                         fNextChar;\n",
    "};\n\n",
);

#
# Emit the state transition table.
#   One initializer row is written per transition read from the input, in the
#   order: action, character (or character class index), next state,
#   push state, next-char flag.
#
print "static const struct RegexTableEl gRuleParseStateTable[] = {\n";
print "    {doNOP, 0, 0, 0, TRUE}\n";    # State 0 is a dummy.  Real states start with index = 1.
for ($state=1; $state < $num_states; $state++) {
    print "    , {$state_func_name[$state],";
    if ($state_literal_chars[$state] ne "") {
        $c = $state_literal_chars[$state];
        # Use the numeric value, so EBCDIC machines are ok.  The character is
        #  passed as an argument, not pasted into the format string, so a
        #  literal '%' cannot be taken for a printf directive.
        printf(" %d /* %s */,", ord($c), $c);
    } else {
        print " $charClasses{$state_char_class[$state]},";
    }
    print " $states{$state_dest_state[$state]},";

    # The push-state field is optional.  If omitted, fill field with a zero, which flags
    #   the state machine that there is no push state.
    if ($state_push_state[$state] eq "") {
        print "0, ";
    } else {
        print " $states{$state_push_state[$state]},";
    }
    print " $state_flag[$state]} ";

    # Put out a C++ comment showing the number (index) of this state row,
    #   and, if this is the first row of the table for this state, the state name.
    print "    //  $state ";
    if ($stateNames[$state] ne "") {
        print "     $stateNames[$state]";
    }
    print "\n";
}
print " };\n";


#
# Emit an array that maps state numbers to state names, then close the
#   namespace and the include guard.
#
#    Rows that do not start a named state get a 0 entry, and one extra 0
#    terminates the array.  This array is used for producing debugging output
#    from the pattern parser.
#
print "static const char * const RegexStateNames[] = {";
$state = 0;
while ($state < $num_states) {
    my $name  = $stateNames[$state];
    my $entry = ($name ne "") ? "     \"$name\",\n" : "    0,\n";
    print $entry;
    $state++;
}
print "    0};\n\n", "U_NAMESPACE_END\n", "#endif\n";