tools/docmaker/sources.py

#  Sources (c) 2002-2004, 2006-2009, 2012, 2013
#    David Turner <david@freetype.org>
#
#
# this file contains definitions of classes needed to decompose
# C sources files into a series of multi-line "blocks". There are
# two kinds of blocks:
#
#   - normal blocks, which contain source code or ordinary comments
#
#   - documentation blocks, which have restricted formatting, and
#     whose text always start with a documentation markup tag like
#     "<Function>", "<Type>", etc..
#
# the routines used to process the content of documentation blocks
# are not contained here, but in "content.py"
#
# the classes and methods found here only deal with text parsing
# and basic documentation block extraction
#

import fileinput, re, sys, os, string


################################################################
##
##  BLOCK FORMAT PATTERN
##
##   A simple class containing compiled regular expressions used
##   to detect potential documentation format block comments within
##   C source code
##
##   note that the 'column' pattern must contain a group that will
##   be used to "unbox" the content of documentation comment blocks
##
class  SourceBlockFormat:

    def  __init__( self, id, start, column, end ):
        """create a block pattern, used to recognize special documentation blocks"""
        self.id     = id
        self.start  = re.compile( start, re.VERBOSE )
        self.column = re.compile( column, re.VERBOSE )
        self.end    = re.compile( end, re.VERBOSE )


#
# format 1 documentation comment blocks look like the following:
#
#    /************************************/
#    /*                                  */
#    /*                                  */
#    /*                                  */
#    /************************************/
#
# we define a few regular expressions here to detect them
#

start = r'''
  \s*      # any number of whitespace
  /\*{2,}/ # followed by '/' and at least two asterisks then '/'
  \s*$     # probably followed by whitespace
'''

column = r'''
  \s*      # any number of whitespace
  /\*{1}   # followed by '/' and precisely one asterisk
  ([^*].*) # followed by anything (group 1)
  \*{1}/   # followed by one asterisk and a '/'
  \s*$     # probably followed by whitespace
'''

re_source_block_format1 = SourceBlockFormat( 1, start, column, start )


#
# format 2 documentation comment blocks look like the following:
#
#    /************************************ (at least 2 asterisks)
#     *
#     *
#     *
#     *
#     **/       (1 or more asterisks at the end)
#
# we define a few regular expressions here to detect them
#
start = r'''
  \s*     # any number of whitespace
  /\*{2,} # followed by '/' and at least two asterisks
  \s*$    # probably followed by whitespace
'''

column = r'''
  \s*        # any number of whitespace
  \*{1}(?!/) # followed by precisely one asterisk not followed by `/'
  (.*)       # then anything (group1)
'''

end = r'''
  \s*  # any number of whitespace
  \*+/ # followed by at least one asterisk, then '/'
'''

re_source_block_format2 = SourceBlockFormat( 2, start, column, end )


#
# the list of supported documentation block formats, we could add new ones
# relatively easily
#
re_source_block_formats = [re_source_block_format1, re_source_block_format2]


#
# the following regular expressions corresponds to markup tags
# within the documentation comment blocks. they're equivalent
# despite their different syntax
#
# notice how each markup tag _must_ begin a new line
#
re_markup_tag1 = re.compile( r'''\s*<((?:\w|-)*)>''' )  # <xxxx> format
re_markup_tag2 = re.compile( r'''\s*@((?:\w|-)*):''' )  # @xxxx: format

#
# the list of supported markup tags, we could add new ones relatively
# easily
#
re_markup_tags = [re_markup_tag1, re_markup_tag2]

#
# used to detect a cross-reference, after markup tags have been stripped
#
re_crossref = re.compile( r'@((?:\w|-)*)(.*)' )    #  @foo

#
# used to detect italic and bold styles in paragraph text
#
re_italic = re.compile( r"_(\w(\w|')*)_(.*)" )     #  _italic_
re_bold   = re.compile( r"\*(\w(\w|')*)\*(.*)" )   #  *bold*

#
# this regular expression code to identify an URL has been taken from
#
#   http://mail.python.org/pipermail/tutor/2002-September/017228.html
#
# (with slight modifications)
#

urls = r'(?:https?|telnet|gopher|file|wais|ftp)'
ltrs = r'\w'
gunk = r'/#~:.?+=&%@!\-'
punc = r'.:?\-'
any  = "%(ltrs)s%(gunk)s%(punc)s" % { 'ltrs' : ltrs,
                                      'gunk' : gunk,
                                      'punc' : punc }
url  = r"""
         (
           \b                    # start at word boundary
           %(urls)s :            # need resource and a colon
           [%(any)s] +?          # followed by one or more of any valid
                                 # character, but be conservative and
                                 # take only what you need to...
           (?=                   # [look-ahead non-consumptive assertion]
             [%(punc)s]*         # either 0 or more punctuation
             (?:                 # [non-grouping parentheses]
               [^%(any)s] | $    # followed by a non-url char
                                 # or end of the string
             )
           )
         )
        """ % {'urls' : urls,
               'any'  : any,
               'punc' : punc }

re_url = re.compile( url, re.VERBOSE | re.MULTILINE )

#
# used to detect the end of commented source lines
#
re_source_sep = re.compile( r'\s*/\*\s*\*/' )

#
# used to perform cross-reference within source output
#
re_source_crossref = re.compile( r'(\W*)(\w*)' )

#
# a list of reserved source keywords
#
re_source_keywords = re.compile( '''\\b ( typedef   |
                                          struct    |
                                          enum      |
                                          union     |
                                          const     |
                                          char      |
                                          int       |
                                          short     |
                                          long      |
                                          void      |
                                          signed    |
                                          unsigned  |
                                          \#include |
                                          \#define  |
                                          \#undef   |
                                          \#if      |
                                          \#ifdef   |
                                          \#ifndef  |
                                          \#else    |
                                          \#endif   ) \\b''', re.VERBOSE )


################################################################
##
##  SOURCE BLOCK CLASS
##
##   A SourceProcessor is in charge of reading a C source file
##   and decomposing it into a series of different "SourceBlocks".
##   each one of these blocks can be made of the following data:
##
##   - A documentation comment block that starts with "/**" and
##     whose exact format will be discussed later
##
##   - normal sources lines, including comments
##
##   the important fields in a text block are the following ones:
##
##     self.lines   : a list of text lines for the corresponding block
##
##     self.content : for documentation comment blocks only, this is the
##                    block content that has been "unboxed" from its
##                    decoration. This is None for all other blocks
##                    (i.e. sources or ordinary comments with no starting
##                     markup tag)
##
class  SourceBlock:

    def  __init__( self, processor, filename, lineno, lines ):
        self.processor = processor
        self.filename  = filename
        self.lineno    = lineno
        self.lines     = lines[:]
        self.format    = processor.format
        self.content   = []

        if self.format == None:
            return

        words = []

        # extract comment lines
        lines = []

        for line0 in self.lines:
            m = self.format.column.match( line0 )
            if m:
                lines.append( m.group( 1 ) )

        # now, look for a markup tag
        for l in lines:
            l = string.strip( l )
            if len( l ) > 0:
                for tag in re_markup_tags:
                    if tag.match( l ):
                        self.content = lines
                        return

    def  location( self ):
        return "(" + self.filename + ":" + repr( self.lineno ) + ")"

    # debugging only - not used in normal operations
    def  dump( self ):
        if self.content:
            print "{{{content start---"
            for l in self.content:
                print l
            print "---content end}}}"
            return

        fmt = ""
        if self.format:
            fmt = repr( self.format.id ) + " "

        for line in self.lines:
            print line


################################################################
##
##  SOURCE PROCESSOR CLASS
##
##   The SourceProcessor is in charge of reading a C source file
##   and decomposing it into a series of different "SourceBlock"
##   objects.
##
##   each one of these blocks can be made of the following data:
##
##   - A documentation comment block that starts with "/**" and
##     whose exact format will be discussed later
##
##   - normal sources lines, include comments
##
##
class  SourceProcessor:

    def  __init__( self ):
        """initialize a source processor"""
        self.blocks   = []
        self.filename = None
        self.format   = None
        self.lines    = []

    def  reset( self ):
        """reset a block processor, clean all its blocks"""
        self.blocks = []
        self.format = None

    def  parse_file( self, filename ):
        """parse a C source file, and add its blocks to the processor's list"""
        self.reset()

        self.filename = filename

        fileinput.close()
        self.format = None
        self.lineno = 0
        self.lines  = []

        for line in fileinput.input( filename ):
            # strip trailing newlines, important on Windows machines!
            if line[-1] == '\012':
                line = line[0:-1]

            if self.format == None:
                self.process_normal_line( line )
            else:
                if self.format.end.match( line ):
                    # that's a normal block end, add it to 'lines' and
                    # create a new block
                    self.lines.append( line )
                    self.add_block_lines()
                elif self.format.column.match( line ):
                    # that's a normal column line, add it to 'lines'
                    self.lines.append( line )
                else:
                    # humm.. this is an unexpected block end,
                    # create a new block, but don't process the line
                    self.add_block_lines()

                    # we need to process the line again
                    self.process_normal_line( line )

        # record the last lines
        self.add_block_lines()

    def  process_normal_line( self, line ):
        """process a normal line and check whether it is the start of a new block"""
        for f in re_source_block_formats:
            if f.start.match( line ):
                self.add_block_lines()
                self.format = f
                self.lineno = fileinput.filelineno()

        self.lines.append( line )

    def  add_block_lines( self ):
        """add the current accumulated lines and create a new block"""
        if self.lines != []:
            block = SourceBlock( self, self.filename, self.lineno, self.lines )

            self.blocks.append( block )
            self.format = None
            self.lines  = []

    # debugging only, not used in normal operations
    def  dump( self ):
        """print all blocks in a processor"""
        for b in self.blocks:
            b.dump()

# eof