/*****************************************************************************
 *  Written by Chris Dunlap <cdunlap@llnl.gov>.
 *  Copyright (C) 2007-2018 Lawrence Livermore National Security, LLC.
 *  Copyright (C) 2001-2007 The Regents of the University of California.
 *  UCRL-CODE-2002-009.
 *
 *  This file is part of ConMan: The Console Manager.
 *  For details, see <https://dun.github.io/conman/>.
 *
 *  ConMan is free software: you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation, either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  ConMan is distributed in the hope that it will be useful, but WITHOUT
 *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with ConMan.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/


#ifndef _LEX_H
#define _LEX_H


/*****************************************************************************\
 *  Laws of the Lexer:
 *----------------------------------------------------------------------------
 *  - Whitespace is ignored.
 *  - Comments are ignored (from the pound char to the newline).
 *  - Lines may be terminated by either carriage-returns (CR),
 *    linefeeds (LF), or carriage-return/linefeed (CR/LF) pairs.
 *  - A newline may be escaped by immediately preceding it with a backslash.
 *  - Integers may begin with either a plus or minus, and contain only digits.
 *  - Strings may be single-quoted or double-quoted.
 *  - Strings cannot contain CRs or LFs.
 *  - Unquoted strings are sequences of letters, digits, and underscores;
 *    they may not begin with a digit (just like a C identifier).
 *  - Tokens are unquoted case-insensitive strings.
\*****************************************************************************/


/***********\
**  Notes  **
\***********/

/*  When a memory allocation request fails, the lexer returns out_of_memory().
 *  By default, this is a macro definition that returns NULL; this macro may
 *  be redefined to invoke another routine instead.  Furthermore, if WITH_OOMF
 *  is defined, this macro will not be defined and the lexer will expect an
 *  external Out-Of-Memory Function to be defined.
 */


/***************\
**  Constants  **
\***************/

#define LEX_MAX_STR 1024                /* max length of lexer string        */

/*  Token values common to every lexer instance.
 *  Single-character tokens are reported by lex_next() as their ASCII code,
 *    so the named tokens following LEX_EOF start at 256 to stay clear of
 *    that range.  Tokens taken from the user-supplied toks[] array are
 *    numbered upward from LEX_TOK_OFFSET.
 */
enum common_tokens {
    LEX_ERR = -1,                       /* lex error token                   */
    LEX_EOF = 0,                        /* end-of-file/buffer token          */
    LEX_EOL = 256,                      /* end-of-line token                 */
    LEX_INT,                            /* integer token: ([+-]?[0-9]+)      */
    LEX_STR,                            /* string token                      */
    LEX_TOK_OFFSET                      /* enum value at which toks[] begin  */
};


/****************\
**  Data Types  **
\****************/

typedef struct lexer_state *Lex;
/*
 *  Lex opaque data type.
 */


/************\
**  Macros  **
\************/

#define LEX_TOK2STR(tokstrs,tok) ((tokstrs)[(tok) - LEX_TOK_OFFSET])
/*
 *  Returns a string in the (tokstrs) array corresponding to the token (tok).
 *  Only use when (tok) is known to be a valid array index corresponding to a
 *    string in the (tokstrs) array of strings since no bounds-checking is
 *    performed.
 */


/**********************\
**  Lexing Functions  **
\**********************/

Lex lex_create(void *buf, char *toks[]);
/*
 *  Creates and returns a new lexer, or out_of_memory() on failure.
 *  The text to be lexed is specified by the NUL-terminated buffer (buf);
 *    this buffer WILL NOT be modified by the lexer.
 *  The NULL-terminated array of strings (toks) defines the set of tokens
 *    that will be recognized by the lexer; these strings must be listed
 *    in a case-insensitive ascending order (ie, according to strcasecmp).
 *  Note: Abandoning a lexer without calling lex_destroy() will result
 *    in a memory leak.
 */

void lex_destroy(Lex l);
/*
 *  Destroys lexer (l), freeing memory used for the lexer itself.
 */

int lex_next(Lex l);
/*
 *  Returns the next token in the buffer given to lex_create()
 *    according to the Laws of the Lexer.
 *  Single-character tokens (eg, punctuation) are specified by
 *    their ASCII code.  Common tokens are specified by the
 *    common_tokens enumeration.  Tokens specified by the (toks)
 *    array of strings begin at LEX_TOK_OFFSET.
 */

int lex_prev(Lex l);
/*
 *  Returns the last token returned by lex_next().
 */

int lex_line(Lex l);
/*
 *  Returns the line number of the last token returned by lex_next().
 */

const char * lex_text(Lex l);
/*
 *  Returns the string corresponding to the last token returned by lex_next().
 */

const char * lex_tok_to_str(Lex l, int tok);
/*
 *  Returns the string from the lex_create() toks[] array corresponding to the
 *    token (tok), or NULL if tok is outside of the toks[] array bounds.
 */


/*************************\
**  Auxiliary Functions  **
\*************************/

char * lex_encode(char *str);
/*
 *  Encodes the string (str) so that it may safely be used by the lexer.
 *    This is needed if the string may contain quote characters.
 *    The string cannot be a constant as it will be modified in place.
 *  Returns the encoded string.
 */

char * lex_decode(char *str);
/*
 *  Decodes the string (str) that has been encoded with lex_encode().
 *    The string cannot be a constant as it will be modified in place.
 *  Returns the decoded string.
 */


/********************\
**  Test Functions  **
\********************/

void lex_parse_test(char *buf, char *toks[]);
/*
 *  Example code that tokenizes the buffer (buf) based upon the
 *    NULL-terminated array of strings (toks) that defines the
 *    set of recognized tokens.
 */


#endif /* !_LEX_H */