1 /*****************************************************************************
2  *  Written by Chris Dunlap <cdunlap@llnl.gov>.
3  *  Copyright (C) 2007-2018 Lawrence Livermore National Security, LLC.
4  *  Copyright (C) 2001-2007 The Regents of the University of California.
5  *  UCRL-CODE-2002-009.
6  *
7  *  This file is part of ConMan: The Console Manager.
8  *  For details, see <https://dun.github.io/conman/>.
9  *
10  *  ConMan is free software: you can redistribute it and/or modify it under
11  *  the terms of the GNU General Public License as published by the Free
12  *  Software Foundation, either version 3 of the License, or (at your option)
13  *  any later version.
14  *
15  *  ConMan is distributed in the hope that it will be useful, but WITHOUT
16  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
17  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
18  *  for more details.
19  *
20  *  You should have received a copy of the GNU General Public License along
21  *  with ConMan.  If not, see <http://www.gnu.org/licenses/>.
22  *****************************************************************************/
23 
24 
25 #ifndef _LEX_H
26 #define _LEX_H
27 
28 
29 /*****************************************************************************\
30  *  Laws of the Lexer:
31  *----------------------------------------------------------------------------
32  *  - Whitespace is ignored.
33  *  - Comments are ignored (from the pound char to the newline).
34  *  - Lines may be terminated by either carriage-returns (CR),
35  *    linefeeds (LF), or carriage-return/linefeed (CR/LF) pairs.
36  *  - A newline may be escaped by immediately preceding it with a backslash.
37  *  - Integers may begin with either a plus or minus, and contain only digits.
38  *  - Strings may be single-quoted or double-quoted.
39  *  - Strings cannot contain CRs or LFs.
40  *  - Unquoted strings are sequences of letters, digits, and underscores;
41  *    they may not begin with a digit (just like a C identifier).
42  *  - Tokens are unquoted case-insensitive strings.
43 \*****************************************************************************/
44 
45 
46 /***********\
47 **  Notes  **
48 \***********/
49 
50 /*  When a memory allocation request fails, the lexer returns out_of_memory().
51  *  By default, this is a macro definition that returns NULL; this macro may
52  *  be redefined to invoke another routine instead.  Furthermore, if WITH_OOMF
53  *  is defined, this macro will not be defined and the lexer will expect an
54  *  external Out-Of-Memory Function to be defined.
55  */
56 
57 
58 /***************\
59 **  Constants  **
60 \***************/
61 
62 #define LEX_MAX_STR 1024                /* max length of lexer string        */
63 
64 enum common_tokens {
65     LEX_ERR = -1,                       /* lex error token                   */
66     LEX_EOF = 0,                        /* end-of-file/buffer token          */
67     LEX_EOL = 256,                      /* end-of-line token (> char codes)  */
68     LEX_INT,                            /* integer token: ([+-]?[0-9]+)      */
69     LEX_STR,                            /* string token                      */
70     LEX_TOK_OFFSET                      /* token value of first toks[] entry */
71 };
72 
73 
74 /****************\
75 **  Data Types  **
76 \****************/
77 
78 typedef struct lexer_state *Lex;
79 /*
80  *  Lex opaque data type: created by lex_create(), freed by lex_destroy().
81  */
82 
83 
84 /************\
85 **  Macros  **
86 \************/
87 
88 #define LEX_TOK2STR(tokstrs,tok) ((tokstrs)[(tok) - LEX_TOK_OFFSET])
89 /*
90  *  Returns the string in the (tokstrs) array corresponding to the token (tok).
91  *  No bounds-checking is performed: only use when (tok) - LEX_TOK_OFFSET is
92  *    known to be a valid index into the (tokstrs) array of strings;
93  *    otherwise, use the bounds-checked lex_tok_to_str() instead.
94  */
95 
96 
97 /**********************\
98 **  Lexing Functions  **
99 \**********************/
100 
101 Lex lex_create(void *buf, char *toks[]);
102 /*
103  *  Creates and returns a new lexer; if memory allocation fails, returns
104  *    out_of_memory() instead (NULL by default).
105  *  The text to be lexed is specified by the NUL-terminated buffer (buf);
106  *    this buffer WILL NOT be modified by the lexer.
107  *  The NULL-terminated array of strings (toks) defines the set of tokens
108  *    recognized by the lexer; these strings must be listed in
109  *    case-insensitive ascending order (ie, according to strcasecmp).
110  *  Note: Abandoning a lexer without calling lex_destroy() leaks memory.
111  */
112 
113 void lex_destroy(Lex l);
114 /*
115  *  Destroys the lexer (l), freeing the memory used for the lexer itself.
116  */
117 
118 int lex_next(Lex l);
119 /*
120  *  Returns the next token in the buffer given to lex_create()
121  *    according to the Laws of the Lexer.
122  *  Single-character tokens (eg, punctuation) are specified by their ASCII
123  *    code.  Common tokens (eg, LEX_EOF, LEX_ERR) are specified by the
124  *    common_tokens enumeration.  Tokens specified by the (toks) array of
125  *    strings begin at LEX_TOK_OFFSET.
126  */
127 
128 int lex_prev(Lex l);
129 /*
130  *  Returns the token most recently returned by lex_next() for lexer (l).
131  */
132 
133 int lex_line(Lex l);
134 /*
135  *  Returns the line number of the token most recently returned by lex_next().
136  */
137 
138 const char * lex_text(Lex l);
139 /*
140  *  Returns the read-only string for the token last returned by lex_next().
141  */
142 
143 const char * lex_tok_to_str(Lex l, int tok);
144 /*
145  *  Returns the string in the lex_create() toks[] array for the token (tok),
146  *    or NULL if (tok) is out of bounds; a checked form of LEX_TOK2STR().
147  */
148 
149 
150 /*************************\
151 **  Auxiliary Functions  **
152 \*************************/
153 
154 char * lex_encode(char *str);
155 /*
156  *  Encodes the string (str) in place so that it may safely be used by the
157  *    lexer; this is needed if the string may contain quote characters.
158  *    The string cannot be a constant since it is modified.
159  *  Returns the encoded string, which lex_decode() can later decode.
160  */
161 
162 char * lex_decode(char *str);
163 /*
164  *  Decodes in place the string (str) previously encoded with lex_encode().
165  *    The string cannot be a constant since it is modified.
166  *  Returns the decoded string.
167  */
168 
169 
170 /********************\
171 **  Test Functions  **
172 \********************/
173 
174 void lex_parse_test(char *buf, char *toks[]);
175 /*
176  *  Example code that tokenizes the buffer (buf) using the NULL-terminated
177  *    array of strings (toks) that defines the set of recognized tokens
178  *    (presumably subject to the same ordering rules as for lex_create()).
179  */
180 
181 
182 #endif /* !_LEX_H */
183