1 /*
2  * This program source code file is part of KICAD, a free EDA CAD application.
3  *
4  * Copyright (C) 2007-2010 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2007-2021 Kicad Developers, see change_log.txt for contributors.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, you may find one here:
19  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20  * or you may search the http://www.gnu.org website for the version 2 license,
21  * or you may write to the Free Software Foundation, Inc.,
22  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
23  */
24 
25 #ifndef DSNLEXER_H_
26 #define DSNLEXER_H_
27 
28 #include <cstdio>
29 #include <hashtables.h>
30 #include <string>
31 #include <vector>
32 
33 #include <richio.h>
34 
35 #ifndef SWIG
36 /**
37  * Hold a keyword string and its unique integer token.
38  */
struct KEYWORD
{
    /// The keyword text; each entry in a keyword table has a unique name.
    const char* name;

    /// The integer token for this keyword: a zero based index into an array of KEYWORDs.
    int         token;
};
44 #endif // SWIG
45 
46 // something like this macro can be used to help initialize a KEYWORD table.
47 // see SPECCTRA_DB::keywords[] as an example.
48 
49 //#define TOKDEF(x)    { #x, T_##x }
50 
51 
52 /**
53  * List all the DSN lexer's tokens that are supported in lexing.
54  *
55  * It is up to the parser if it wants also to support them.
56  */
enum DSN_SYNTAX_T
{
    // All values are negative and explicitly assigned, so the listing order is
    // purely cosmetic.
    DSN_EOF          = -1,   // special case for end of file
    DSN_STRING       = -2,   // a quoted string, stripped of the quotes
    DSN_LEFT         = -3,   // left bracket, '('
    DSN_RIGHT        = -4,   // right bracket, ')'
    DSN_NUMBER       = -5,
    DSN_SYMBOL       = -6,
    DSN_DASH         = -7,
    DSN_QUOTE_DEF    = -8,
    DSN_STRING_QUOTE = -9,
    DSN_COMMENT      = -10,
    DSN_NONE         = -11
};
71 
72 
73 /**
74  * Implement a lexical analyzer for the SPECCTRA DSN file format.
75  *
76  * It reads lexical tokens from the current #LINE_READER through the #NextTok() function.
77  */
78 class DSNLEXER
79 {
80 public:
81 
82     /**
83      * Initialize a DSN lexer and prepares to read from aFile which is already open and has
84      * \a aFilename.
85      *
86      * @param aKeywordTable is an array of KEYWORDS holding \a aKeywordCount.  This
87      *  token table need not contain the lexer separators such as '(' ')', etc.
88      * @param aKeywordCount is the count of tokens in aKeywordTable.
89      * @param aFile is an open file, which will be closed when this is destructed.
90      * @param aFileName is the name of the file
91      */
92     DSNLEXER( const KEYWORD* aKeywordTable, unsigned aKeywordCount,
93               FILE* aFile, const wxString& aFileName );
94 
95     /**
96      * Initialize a DSN lexer and prepares to read from @a aSExpression.
97      *
98      * @param aKeywordTable is an array of KEYWORDS holding \a aKeywordCount.  This
99      *  token table need not contain the lexer separators such as '(' ')', etc.
100      * @param aKeywordCount is the count of tokens in aKeywordTable.
101      * @param aSExpression is text to feed through a STRING_LINE_READER
102      * @param aSource is a description of aSExpression, used for error reporting.
103      */
104     DSNLEXER( const KEYWORD* aKeywordTable, unsigned aKeywordCount,
105               const std::string& aSExpression, const wxString& aSource = wxEmptyString );
106 
107     /**
108      * Initialize a DSN lexer and prepares to read from @a aSExpression.
109      *
110      * Use this one without a keyword table with the DOM parser in ptree.h.
111      *
112      * @param aSExpression is text to feed through a #STRING_LINE_READER
113      * @param aSource is a description of aSExpression, used for error reporting.
114      */
115     DSNLEXER( const std::string& aSExpression, const wxString& aSource = wxEmptyString );
116 
117     /**
118      * Initialize a DSN lexer and prepares to read from @a aLineReader which is already
119      * open, and may be in use by other DSNLEXERs also.
120      *
121      * No ownership is taken of @a aLineReader. This enables it to be used by other DSNLEXERs.
122      *
123      * @param aKeywordTable is an array of #KEYWORDS holding \a aKeywordCount.  This
124      *  token table need not contain the lexer separators such as '(' ')', etc.
125      * @param aKeywordCount is the count of tokens in aKeywordTable.
126      * @param aLineReader is any subclassed instance of LINE_READER, such as
127      *  #STRING_LINE_READER or #FILE_LINE_READER.  No ownership is taken.
128      */
129     DSNLEXER( const KEYWORD* aKeywordTable, unsigned aKeywordCount,
130               LINE_READER* aLineReader = nullptr );
131 
132     virtual ~DSNLEXER();
133 
134     /**
135      * Reinit variables used during parsing, to ensure od states are not used in a new parsing
136      * must be called before parsing a new file after parsing an old file to avoid
137      * starting with some variables in a non initial state
138      */
139     void InitParserState();
140 
141     /**
142      * Usable only for DSN lexers which share the same #LINE_READER.
143      *
144      * Synchronizes the pointers handling the data read by the #LINE_READER.  Allows 2
145      * #DNSLEXER objects to share the same current line, when switching from a #DNSLEXER
146      * to another #DNSLEXER
147      * @param aLexer the model.
148      * @return true if the sync can be made ( at least the same line reader ).
149      */
150     bool SyncLineReaderWith( DSNLEXER& aLexer );
151 
152     /**
153      * Change the behavior of this lexer into or out of "specctra mode".
154      *
155      * If specctra mode, then:
156      *  -#) stringDelimiter can be changed.
157      *  -#) KiCad quoting protocol is not in effect.
158      *  -#) space_in_quoted_tokens is functional else none of the above are true.
159      *
160      * The default mode is non-specctra mode, meaning:
161      *  -#) stringDelimiter cannot be changed.
162      *  -#) KiCad quoting protocol is in effect.
163      *  -#) space_in_quoted_tokens is not functional.
164      */
165     void SetSpecctraMode( bool aMode );
166 
167     /**
168      * Manage a stack of LINE_READERs in order to handle nested file inclusion.
169      *
170      * This function pushes aLineReader onto the top of a stack of LINE_READERs and makes
171      * it the current #LINE_READER with its own #GetSource(), line number and line text.
172      * A grammar must be designed such that the "include" token (whatever its various names),
173      * and any of its parameters are not followed by anything on that same line,
174      * because PopReader always starts reading from a new line upon returning to
175      * the original #LINE_READER.
176      */
177     void PushReader( LINE_READER* aLineReader );
178 
179     /**
180      * Delete the top most #LINE_READER from an internal stack of LINE_READERs and
181      * in the case of #FILE_LINE_READER this means the associated FILE is closed.
182      *
183      * The most recently used former #LINE_READER on the stack becomes the
184      * current #LINE_READER and its previous position in its input stream and the
185      * its latest line number should pertain.  PopReader always starts reading
186      * from a new line upon returning to the previous #LINE_READER.  A pop is only
187      * possible if there are at least 2 #LINE_READERs on the stack, since popping
188      * the last one is not supported.
189      *
190      * @return the LINE_READER that was in use before the pop, or NULL
191      *   if there was not at least two readers on the stack and therefore the
192      *   pop failed.
193      */
194     LINE_READER* PopReader();
195 
196     /**
197      * Return the next token found in the input file or DSN_EOF when reaching the end of
198      * file.
199      *
200      * Users should wrap this function to return an enum to aid in grammar debugging while
201      * running under a debugger, but leave this lower level function returning an int (so
202      * the enum does not collide with another usage).
203      *
204      * @return the type of token found next.
205      * @throw IO_ERROR only if the #LINE_READER throws it.
206      */
207     int NextTok();
208 
209     /**
210      * Call #NextTok() and then verifies that the token read in satisfies #IsSymbol().
211      *
212      * @return the actual token read in.
213      * @throw IO_ERROR if the next token does not satisfy IsSymbol().
214      */
215     int NeedSYMBOL();
216 
217     /**
218      * Call #NextTok() and then verifies that the token read in satisfies bool IsSymbol() or
219      * the next token is #DSN_NUMBER.
220      *
221      * @return the actual token read in.
222      * @throw IO_ERROR if the next token does not satisfy the above test.
223      */
224     int NeedSYMBOLorNUMBER();
225 
226     /**
227      * Call #NextTok() and then verifies that the token read is type #DSN_NUMBER.
228      *
229      * @return the actual token read in.
230      * @throw IO_ERROR if the next token does not satisfy the above test.
231      */
232     int NeedNUMBER( const char* aExpectation );
233 
234     /**
235      * Return whatever #NextTok() returned the last time it was called.
236      */
CurTok()237     int CurTok() const
238     {
239         return curTok;
240     }
241 
242     /**
243      * Return whatever NextTok() returned the 2nd to last time it was called.
244      */
PrevTok()245     int PrevTok() const
246     {
247         return prevTok;
248     }
249 
250     /**
251      * Used to support "loose" matches (quoted tokens).
252      */
GetCurStrAsToken()253     int GetCurStrAsToken() const
254     {
255         return findToken( curText );
256     }
257 
258     /**
259      * Change the string delimiter from the default " to some other character and return
260      * the old value.
261      *
262      * @param aStringDelimiter The character in lowest 8 bits.
263      * @return The old delimiter in the lowest 8 bits.
264      */
SetStringDelimiter(char aStringDelimiter)265     char SetStringDelimiter( char aStringDelimiter )
266     {
267         int old = stringDelimiter;
268 
269         if( specctraMode )
270             stringDelimiter = aStringDelimiter;
271 
272         return old;
273     }
274 
275     /**
276      * Change the setting controlling whether a space in a quoted string isa terminator.
277      *
278      * @param val If true, means
279      */
SetSpaceInQuotedTokens(bool val)280     bool SetSpaceInQuotedTokens( bool val )
281     {
282         bool old = space_in_quoted_tokens;
283 
284         if( specctraMode )
285             space_in_quoted_tokens = val;
286 
287         return old;
288     }
289 
290     /**
291      * Change the handling of comments.
292      *
293      * If set true, comments are returned as single line strings with a terminating newline.
294      * Otherwise they are consumed by the lexer and not returned.
295      */
SetCommentsAreTokens(bool val)296     bool SetCommentsAreTokens( bool val )
297     {
298         bool old = commentsAreTokens;
299         commentsAreTokens = val;
300         return old;
301     }
302 
303     /**
304      * Check the next sequence of tokens and reads them into a wxArrayString if they are
305      * comments.
306      *
307      * Reading continues until a non-comment token is encountered, and such last read token
308      * remains as #CurTok() and as #CurText().  No push back or "un get" mechanism is used
309      * for this support.  Upon return you simply avoid calling NextTok() for the next token,
310      * but rather #CurTok().
311      *
312      * @return Heap allocated block of comments or NULL if none.  The caller owns the
313      *         allocation and must delete if not NULL.
314      */
315     wxArrayString* ReadCommentLines();
316 
317     /**
318      * Test a token to see if it is a symbol.
319      *
320      * This means it cannot be a special delimiter character such as #DSN_LEFT, #DSN_RIGHT,
321      * #DSN_QUOTE, etc.  It may however, coincidentally match a keyword and still be a symbol.
322      */
323     static bool IsSymbol( int aTok );
324 
325     /**
326      * Throw an #IO_ERROR exception with an input file specific error message.
327      *
328      * @param aTok is the token/keyword type which was expected at the current input location.
329      * @throw IO_ERROR with the location within the input file of the problem.
330      */
331     void Expecting( int aTok ) const;
332 
333     /**
334      * Throw an #IO_ERROR exception with an input file specific error message.
335      *
336      * @param aTokenList is the token/keyword type which was expected at the
337      *                   current input location, e.g.  "pin|graphic|property".
338      * @throw IO_ERROR with the location within the input file of the problem.
339      */
340     void Expecting( const char* aTokenList ) const;
341 
342     /**
343      * Throw an #IO_ERROR exception with an input file specific error message.
344      *
345      * @param aTok is the token/keyword type which was not expected at the
346      *             current input location.
347      * @throw IO_ERROR with the location within the input file of the problem.
348      */
349     void Unexpected( int aTok ) const;
350 
351     /**
352      * Throw an #IO_ERROR exception with an input file specific error message.
353      *
354      * @param aToken is the token which was not expected at the current input location.
355      * @throw IO_ERROR with the location within the input file of the problem.
356      */
357     void Unexpected( const char* aToken ) const;
358 
359     /**
360      * Throw an #IO_ERROR exception with a message saying specifically that \a aTok
361      * is a duplicate of one already seen in current context.
362      *
363      * @param aTok is the token/keyword type which was not expected at the current input
364      *             location.
365      * @throw IO_ERROR with the location within the input file of the problem.
366      */
367     void Duplicate( int aTok );
368 
369     /**
370      * Call #NextTok() and then verifies that the token read in is a #DSN_LEFT.
371      *
372      * @throw IO_ERROR if the next token is not a #DSN_LEFT
373      */
374     void NeedLEFT();
375 
376     /**
377      * Call #NextTok() and then verifies that the token read in is a #DSN_RIGHT.
378      *
379      * @throw IO_ERROR if the next token is not a #DSN_RIGHT
380      */
381     void NeedRIGHT();
382 
383     /**
384      * Return the C string representation of a #DSN_T value.
385      */
386     const char* GetTokenText( int aTok ) const;
387 
388     /**
389      * Return a quote wrapped wxString representation of a token value.
390      */
391     wxString GetTokenString( int aTok ) const;
392 
393     static const char* Syntax( int aTok );
394 
395     /**
396      * Return a pointer to the current token's text.
397      */
CurText()398     const char* CurText() const
399     {
400         return curText.c_str();
401     }
402 
403     /**
404      * Return a reference to current token in std::string form.
405      */
CurStr()406     const std::string& CurStr() const
407     {
408         return curText;
409     }
410 
411     /**
412      * Return the current token text as a wxString, assuming that the input byte stream
413      * is UTF8 encoded.
414      */
FromUTF8()415     wxString FromUTF8() const
416     {
417         return wxString::FromUTF8( curText.c_str() );
418     }
419 
420     /**
421      * Return the current line number within my #LINE_READER.
422      */
CurLineNumber()423     int CurLineNumber() const
424     {
425         return reader->LineNumber();
426     }
427 
428     /**
429      * Return the current line of text from which the #CurText() would return its token.
430      */
CurLine()431     const char* CurLine() const
432     {
433         return (const char*)(*reader);
434     }
435 
436     /**
437      * Return the current #LINE_READER source.
438      *
439      * @return source of the lines of text, e.g. a filename or "clipboard".
440      */
CurSource()441     const wxString& CurSource() const
442     {
443         return reader->GetSource();
444     }
445 
446     /**
447      * Return the byte offset within the current line, using a 1 based index.
448      *
449      * @return a one based index into the current line.
450      */
CurOffset()451     int CurOffset() const
452     {
453         return curOffset + 1;
454     }
455 
456 #ifndef SWIG
457 
458 protected:
459     void init();
460 
readLine()461     int readLine()
462     {
463         if( reader )
464         {
465             reader->ReadLine();
466 
467             unsigned len = reader->Length();
468 
469             // start may have changed in ReadLine(), which can resize and
470             // relocate reader's line buffer.
471             start = reader->Line();
472 
473             next  = start;
474             limit = next + len;
475 
476             return len;
477         }
478         return 0;
479     }
480 
481     /**
482      * Take @a aToken string and looks up the string in the keywords table.
483      *
484      * @param aToken is a string to lookup in the keywords table.
485      * @return with a value from the enum #DSN_T matching the keyword text,
486      *         or #DSN_SYMBOL if @a aToken is not in the keywords table.
487      */
488     int findToken( const std::string& aToken ) const;
489 
isStringTerminator(char cc)490     bool isStringTerminator( char cc ) const
491     {
492         if( !space_in_quoted_tokens && cc == ' ' )
493             return true;
494 
495         if( cc == stringDelimiter )
496             return true;
497 
498         return false;
499     }
500 
501     bool                iOwnReaders;            ///< on readerStack, should I delete them?
502     const char*         start;
503     const char*         next;
504     const char*         limit;
505     char                dummy[1];               ///< when there is no reader.
506 
507     typedef std::vector<LINE_READER*>  READER_STACK;
508 
509     READER_STACK        readerStack;            ///< all the LINE_READERs by pointer.
510 
511     ///< no ownership. ownership is via readerStack, maybe, if iOwnReaders
512     LINE_READER*        reader;
513 
514     bool                specctraMode;           ///< if true, then:
515                                                 ///< 1) stringDelimiter can be changed
516                                                 ///< 2) Kicad quoting protocol is not in effect
517                                                 ///< 3) space_in_quoted_tokens is functional
518                                                 ///< else not.
519 
520     char                stringDelimiter;
521     bool                space_in_quoted_tokens; ///< blank spaces within quoted strings
522 
523     bool                commentsAreTokens;      ///< true if should return comments as tokens
524 
525     int                 prevTok;                ///< curTok from previous NextTok() call.
526     int                 curOffset;              ///< offset within current line of the current token
527 
528     int                 curTok;                 ///< the current token obtained on last NextTok()
529     std::string         curText;                ///< the text of the current token
530 
531     const KEYWORD*      keywords;               ///< table sorted by CMake for bsearch()
532     unsigned            keywordCount;           ///< count of keywords table
533     KEYWORD_MAP         keyword_hash;           ///< fast, specialized "C string" hashtable
534 #endif // SWIG
535 };
536 
537 #endif  // DSNLEXER_H_
538