1#
2#  sources.py
3#
4#    Convert source code comments to multi-line blocks (library file).
5#
6#  Copyright 2002-2018 by
7#  David Turner.
8#
9#  This file is part of the FreeType project, and may only be used,
10#  modified, and distributed under the terms of the FreeType project
11#  license, LICENSE.TXT.  By continuing to use, modify, or distribute
12#  this file you indicate that you have read the license and
13#  understand and accept it fully.
14
15#
16# This library file contains definitions of classes needed to decompose C
17# source code files into a series of multi-line `blocks'.  There are two
18# kinds of blocks.
19#
20#   - Normal blocks, which contain source code or ordinary comments.
21#
#   - Documentation blocks, which have restricted formatting, and whose text
#     always starts with a documentation markup tag like `<Function>',
#     `<Type>', etc.
25#
26# The routines to process the content of documentation blocks are contained
27# in file `content.py'; the classes and methods found here only deal with
28# text parsing and basic documentation block extraction.
29#
30
31
32import fileinput, re, string
33
34
35################################################################
36##
37##  SOURCE BLOCK FORMAT CLASS
38##
39##  A simple class containing compiled regular expressions to detect
40##  potential documentation format block comments within C source code.
41##
42##  The `column' pattern must contain a group to `unbox' the content of
43##  documentation comment blocks.
44##
45##  Later on, paragraphs are converted to long lines, which simplifies the
46##  regular expressions that act upon the text.
47##
class  SourceBlockFormat:
    """Bundle of compiled patterns describing one comment block layout.

    The attributes `start', `column', and `end' hold the compiled forms of
    the pattern strings given to the constructor; `id' is stored as is.
    """

    def  __init__( self, id, start, column, end ):
        """Compile the three pattern strings in verbose mode.

        `id' identifies the format.  `start', `column', and `end' are the
        regular expression sources for the opening line, an inner line,
        and the closing line of a block, respectively.
        """
        compiled = [ re.compile( source, re.VERBOSE )
                     for source in ( start, column, end ) ]

        self.id = id
        self.start, self.column, self.end = compiled
57
58
#
# Format 1 documentation comment blocks.
#
#    /************************************/ (at least 2 asterisks)
#    /*                                  */
#    /*                                  */
#    /*                                  */
#    /************************************/ (at least 2 asterisks)
#
# The opening and the closing line have the same shape, so the `start'
# pattern is passed to `SourceBlockFormat' a second time as the `end'
# pattern.
#
# Group 1 of the `column' pattern is the text between the leading `/*' and
# the last `*/' of the line; its first character must not be an asterisk.
#
start = r'''
  \s*      # any number of whitespace
  /\*{2,}/ # followed by '/' and at least two asterisks then '/'
  \s*$     # probably followed by whitespace
'''

column = r'''
  \s*      # any number of whitespace
  /\*{1}   # followed by '/' and precisely one asterisk
  ([^*].*) # followed by anything (group 1)
  \*{1}/   # followed by one asterisk and a '/'
  \s*$     # probably followed by whitespace
'''

re_source_block_format1 = SourceBlockFormat( 1, start, column, start )
83
84
#
# Format 2 documentation comment blocks.
#
#    /************************************ (at least 2 asterisks)
#     *
#     *                                    (1 asterisk)
#     *
#     */                                   (1 or more asterisks)
#
# The assignments below rebind the module-level names `start' and `column'
# that were used for format 1.
#
# The look-ahead in the `column' pattern rejects both `**' and `*/', so a
# line starting with two asterisks or with the closing `*/' is not taken
# as a column line.  Group 1 is everything after the single asterisk.
#
start = r'''
  \s*     # any number of whitespace
  /\*{2,} # followed by '/' and at least two asterisks
  \s*$    # probably followed by whitespace
'''

column = r'''
  \s*           # any number of whitespace
  \*{1}(?![*/]) # followed by precisely one asterisk not followed by `/'
  (.*)          # then anything (group1)
'''

end = r'''
  \s*  # any number of whitespace
  \*+/ # followed by at least one asterisk, then '/'
'''

re_source_block_format2 = SourceBlockFormat( 2, start, column, end )
112
113
#
# The list of supported documentation block formats.  We could add new ones
# quite easily.
#
# `SourceProcessor.process_normal_line' tests each line that is outside of
# a documentation block against the `start' pattern of every entry.
#
re_source_block_formats = [re_source_block_format1, re_source_block_format2]
119
120
#
# The following regular expressions correspond to markup tags within the
# documentation comment blocks.  They are equivalent despite their different
# syntax.
#
# A markup tag consists of word characters (letters, digits, `_') or the
# character `-', to be found in group 1.  Leading whitespace is skipped.
# Since the name is matched with `*', an empty name (`<>' or `@:') is
# accepted by the patterns, too.
#
# Notice that a markup tag _must_ begin a new paragraph.
#
re_markup_tag1 = re.compile( r'''\s*<((?:\w|-)*)>''' )  # <xxxx> format
re_markup_tag2 = re.compile( r'''\s*@((?:\w|-)*):''' )  # @xxxx: format

#
# The list of supported markup tags.  We could add new ones quite easily.
#
re_markup_tags = [re_markup_tag1, re_markup_tag2]
137
138
#
# A regular expression to detect a cross reference, after markup tags have
# been stripped off.
#
# Two syntax forms are supported:
#
#   @<name>
#   @<name>[<id>]
#
# where both `<name>' and `<id>' consist of alphanumeric characters, `_',
# and `-'.  Use `<id>' if there are multiple, valid `<name>' entries.
#
# Example: @foo[bar]
#
# The named group `name' holds `<name>' together with the optional
# `[<id>]' suffix (`foo[bar]' in the example); the named group `rest' holds
# everything that follows the cross reference.
#
re_crossref = re.compile( r"""
                            @
                            (?P<name>(?:\w|-)+
                                     (?:\[(?:\w|-)+\])?)
                            (?P<rest>.*)
                          """, re.VERBOSE )
159
#
# Two regular expressions to detect italic and bold markup, respectively.
# Group 1 is the markup, group 2 the rest of the line.
#
# Note that the markup is limited to words consisting of letters, digits,
# the characters `_' and `-', or an apostrophe (but not as the first
# character).  Whitespace is not allowed between the two delimiters, so
# only a single word can be marked up at a time.
#
re_italic = re.compile( r"_((?:\w|-)(?:\w|'|-)*)_(.*)" )     #  _italic_
re_bold   = re.compile( r"\*((?:\w|-)(?:\w|'|-)*)\*(.*)" )   #  *bold*
170
#
# A regular expression to identify a URL.  The approach has been adapted
# (with slight modifications) from
#
#   https://mail.python.org/pipermail/tutor/2002-September/017228.html
#
# The pattern is assembled from the following pieces.
#
#   urls  the accepted schemes
#   ltrs  word characters
#   gunk  other characters that may occur within a URL
#   punc  punctuation that may occur within a URL but is not taken as
#         part of it when it is trailing
#   any   the three character sets above, concatenated for use in a
#         character class
#
# Group 1 of `re_url' is the URL without trailing punctuation.
#
# NOTE: The module-level name `any' hides the built-in function of the
# same name within this module.
#
urls = r'(?:https?|telnet|gopher|file|wais|ftp)'
ltrs = r'\w'
gunk = r'/#~:.?+=&%@!\-'
punc = r'.:?\-'
any  = ltrs + gunk + punc

url  = r"""
         (
           \b                    # start at word boundary
           %(urls)s :            # need resource and a colon
           [%(any)s] +?          # followed by one or more of any valid
                                 # character, but be conservative and
                                 # take only what you need to...
           (?=                   # [look-ahead non-consumptive assertion]
             [%(punc)s]*         # either 0 or more punctuation
             (?:                 # [non-grouping parentheses]
               [^%(any)s] | $    # followed by a non-url char
                                 # or end of the string
             )
           )
         )
        """ % dict( urls = urls, any = any, punc = punc )

re_url = re.compile( url, re.MULTILINE | re.VERBOSE )
205
#
# A regular expression that stops collection of comments for the current
# block.
#
# It recognizes an empty C comment.  Whitespace is allowed in front of the
# comment and between its delimiters; the pattern has no end anchor, so
# text after the comment does not prevent a match.
#
re_source_sep = re.compile( r'\s*/\*\s*\*/' )   #  /* */
211
#
# A regular expression to find possible C identifiers while outputting
# source code verbatim, covering things like `*foo' or `(bar'.  Group 1 is
# the prefix, group 2 the identifier -- since we scan lines from left to
# right, sequentially splitting the source code into prefix and identifier
# is fully sufficient for our purposes.
#
# Both groups use `*', so either of them can be empty, and the pattern
# also matches the empty string.
#
re_source_crossref = re.compile( r'(\W*)(\w*)' )
220
#
# A regular expression that matches a list of reserved C source keywords.
# The matched keyword is available in group 1.
#
# The pattern is written as a raw string so that `\b' and `\#' reach the
# regular expression engine unchanged; in a normal string literal, `\#' is
# an invalid escape sequence that newer Python versions warn about.  In
# verbose mode, `\#' stands for a literal `#' character.
#
# NOTE: `#' is not a word character, so the leading `\b' only matches in
# front of a preprocessor keyword if the preceding character is a word
# character.  A directive at the start of a line is thus not matched.
#
re_source_keywords = re.compile( r'''\b ( typedef   |
                                          struct    |
                                          enum      |
                                          union     |
                                          const     |
                                          char      |
                                          int       |
                                          short     |
                                          long      |
                                          void      |
                                          signed    |
                                          unsigned  |
                                          \#include |
                                          \#define  |
                                          \#undef   |
                                          \#if      |
                                          \#ifdef   |
                                          \#ifndef  |
                                          \#else    |
                                          \#endif   ) \b''', re.VERBOSE )
244
245
246################################################################
247##
248##  SOURCE BLOCK CLASS
249##
250##  There are two important fields in a `SourceBlock' object.
251##
252##    self.lines
253##      A list of text lines for the corresponding block.
254##
##    self.content
##      For documentation comment blocks only, this is the block content
##      that has been `unboxed' from its decoration.  This is an empty list
##      for all other blocks (i.e., sources or ordinary comments with no
##      starting markup tag).
260##
class  SourceBlock:
    """A run of consecutive lines taken from a source file.

    Attributes:

      processor  The object that created the block.
      filename   The file name passed to the constructor.
      lineno     The line number passed to the constructor.
      lines      A copy of the text lines of the block.
      format     The value of `processor.format' at construction time;
                 `None' for blocks that are not comment blocks.
      content    The unboxed comment lines if at least one of them starts
                 with a markup tag, otherwise an empty list.
    """

    def  __init__( self, processor, filename, lineno, lines ):
        """Store the block data and extract the documentation content.

        `processor' must provide a `format' attribute that is either
        `None' or an object with a compiled `column' pattern whose group 1
        yields the unboxed text of a comment line.
        """
        self.processor = processor
        self.filename  = filename
        self.lineno    = lineno
        self.lines     = lines[:]
        self.format    = processor.format
        self.content   = []

        if self.format is None:
            return

        # extract comment lines; lines not matching the column pattern
        # (like the opening and closing line of a block) are dropped
        column  = self.format.column
        unboxed = []

        for raw in self.lines:
            m = column.match( raw )
            if m:
                unboxed.append( m.group( 1 ) )

        # Now, look for a markup tag.  A single line starting with a tag
        # is sufficient to make this a documentation block.
        #
        # The `strip' method is used since function `string.strip' is not
        # available in Python 3.
        for text in unboxed:
            text = text.strip()
            if not text:
                continue

            for tag in re_markup_tags:
                if tag.match( text ):
                    self.content = unboxed
                    return

    def  location( self ):
        """Return the block position as a string `(<filename>:<lineno>)'."""
        return "(" + self.filename + ":" + repr( self.lineno ) + ")"

    # debugging only -- not used in normal operations
    def  dump( self ):
        """Print the content of a documentation block between two marker
           lines, or the raw lines of any other block."""
        if self.content:
            print( "{{{content start---" )
            for l in self.content:
                print( l )
            print( "---content end}}}" )
            return

        for line in self.lines:
            print( line )
311
312
313################################################################
314##
315##  SOURCE PROCESSOR CLASS
316##
317##  The `SourceProcessor' is in charge of reading a C source file and
318##  decomposing it into a series of different `SourceBlock' objects.
319##
320##  A SourceBlock object consists of the following data.
321##
322##    - A documentation comment block using one of the layouts above.  Its
323##      exact format will be discussed later.
324##
##    - Normal source lines, including comments.
326##
327##
class  SourceProcessor:
    """Split a source file into a list of `SourceBlock' objects.

    Attributes:

      blocks    The blocks collected from the most recently parsed file.
      filename  The name of the file passed to `parse_file', or `None'.
      format    The block format currently being collected, or `None'
                while normal lines are read.
      lineno    The line number recorded for the block being collected.
      lines     The lines accumulated for the block being collected.
    """

    def  __init__( self ):
        """Initialize a source processor."""
        self.blocks   = []
        self.filename = None
        self.format   = None
        # `add_block_lines' reads this attribute, so it must exist even
        # if `parse_file' has not been called yet
        self.lineno   = 0
        self.lines    = []

    def  reset( self ):
        """Reset a block processor and clean up all its blocks."""
        self.blocks = []
        self.format = None

    def  parse_file( self, filename ):
        """Parse a C source file and store its blocks in the processor's
           list; blocks from a previous call are discarded."""
        self.reset()

        self.filename = filename

        # module `fileinput' keeps global state; close a previous input
        # before starting a new one
        fileinput.close()
        self.format = None
        self.lineno = 0
        self.lines  = []

        for line in fileinput.input( filename ):
            # strip trailing newlines, important on Windows machines!
            if line.endswith( '\012' ):
                line = line[:-1]

            if self.format is not None:
                if self.format.end.match( line ):
                    # A normal block end.  Add it to `lines' and create a
                    # new block.
                    self.lines.append( line )
                    self.add_block_lines()
                    continue

                if self.format.column.match( line ):
                    # A normal column line.  Add it to `lines'.
                    self.lines.append( line )
                    continue

                # An unexpected block end.  Create a new block, then
                # process the line again as a normal line.
                self.add_block_lines()

            # The line opens a new block if nothing has been accumulated
            # yet; record its position so that normal blocks do not
            # report the line number of an earlier documentation block.
            if not self.lines:
                self.lineno = fileinput.filelineno()

            self.process_normal_line( line )

        # record the last lines
        self.add_block_lines()

    def  process_normal_line( self, line ):
        """Process a normal line and check whether it is the start of a new
           block."""
        for f in re_source_block_formats:
            if f.start.match( line ):
                # flush the lines collected so far, then start a new
                # block at the current input line
                self.add_block_lines()
                self.format = f
                self.lineno = fileinput.filelineno()

        self.lines.append( line )

    def  add_block_lines( self ):
        """Add the current accumulated lines and create a new block.

        Nothing happens if no lines have been accumulated.
        """
        if not self.lines:
            return

        block = SourceBlock( self,
                             self.filename,
                             self.lineno,
                             self.lines )

        self.blocks.append( block )
        self.format = None
        self.lines  = []

    # debugging only, not used in normal operations
    def  dump( self ):
        """Print all blocks in a processor."""
        for b in self.blocks:
            b.dump()
409
410# eof
411