1
2"""
3yaml.py
4
5Lexer for YAML, a human-friendly data serialization language
6(http://yaml.org/).
7
8Written by Kirill Simonov <xi@resolvent.net>.
9
10License: Whatever suitable for inclusion into the Pygments package.
11"""
12
13from pygments.lexer import  \
14        ExtendedRegexLexer, LexerContext, include, bygroups
15from pygments.token import  \
16        Text, Comment, Punctuation, Name, Literal
17
18__all__ = ['YAMLLexer']
19
20
class YAMLLexerContext(LexerContext):
    """Lexer context extended with YAML indentation bookkeeping.

    The extra attributes are read and updated by the callback factories
    defined in this module.
    """

    def __init__(self, *args, **kwds):
        # The base context receives every argument unchanged.
        super(YAMLLexerContext, self).__init__(*args, **kwds)
        # Indentation of the block scalar being scanned; None until it
        # is given explicitly or detected.
        self.block_scalar_indent = None
        # Candidate indentation level collected for the current line.
        self.next_indent = 0
        # Current indentation level; starts below any real column.
        self.indent = -1
        # Enclosing indentation levels, pushed whenever ``indent`` grows.
        self.indent_stack = []
30
31
def something(TokenClass):
    """Return a callback that emits the match unless it is empty.

    For a non-empty match the callback yields one ``TokenClass`` token
    and moves ``context.pos`` to the end of the match.  For an empty
    match it yields nothing and leaves ``context.pos`` untouched.
    """
    def callback(lexer, match, context):
        matched = match.group()
        if matched:
            yield match.start(), TokenClass, matched
            context.pos = match.end()
    return callback
41
def reset_indent(TokenClass):
    """Return a callback that clears the indentation state.

    The callback puts the indentation attributes of the context back to
    the values ``YAMLLexerContext`` starts with, emits the whole match as
    ``TokenClass`` and moves ``context.pos`` to the end of the match.
    """
    def callback(lexer, match, context):
        # No block scalar indentation, no pending candidate, no current
        # level and no saved levels.
        context.block_scalar_indent = None
        context.next_indent = 0
        context.indent = -1
        context.indent_stack = []
        yield match.start(), TokenClass, match.group()
        context.pos = match.end()
    return callback
53
def save_indent(TokenClass, start=False):
    """Return a callback that records a possible indentation level.

    With ``start=True`` the match is the indentation at the beginning of
    a line: its length becomes ``context.next_indent`` and, when it is
    smaller than the current level, saved levels are popped from
    ``context.indent_stack`` until the current level no longer exceeds
    it.  If the line then sits strictly between two levels, the spaces
    beyond the enclosing level are emitted as ``TokenClass.Error``.

    With ``start=False`` the length of the match is added to
    ``context.next_indent``.

    The callback always moves ``context.pos`` to the end of the match.
    """
    def callback(lexer, match, context):
        text = match.group()
        extra = ''
        if start:
            context.next_indent = len(text)
            if context.next_indent < context.indent:
                while context.next_indent < context.indent:
                    context.indent = context.indent_stack.pop()
                if context.next_indent > context.indent:
                    # ``indent`` is -1 once every saved level has been
                    # popped; clamp it so the slice does not count from
                    # the end of the string and flag only the last space.
                    split = max(context.indent, 0)
                    extra = text[split:]
                    text = text[:split]
        else:
            context.next_indent += len(text)
        if text:
            yield match.start(), TokenClass, text
        if extra:
            yield match.start() + len(text), TokenClass.Error, extra
        context.pos = match.end()
    return callback
75
def set_indent(TokenClass, implicit=False):
    """Return a callback that commits the saved indentation level.

    When ``context.next_indent`` is deeper than ``context.indent``, the
    current level is pushed onto ``context.indent_stack`` and replaced
    by the saved one.  Unless ``implicit`` is true, the length of the
    match is then added to ``context.next_indent``.  The match is always
    emitted as ``TokenClass`` and ``context.pos`` moved past it.
    """
    def callback(lexer, match, context):
        matched = match.group()
        pending = context.next_indent
        if pending > context.indent:
            context.indent_stack.append(context.indent)
            context.indent = pending
        if not implicit:
            context.next_indent = pending + len(matched)
        yield match.start(), TokenClass, matched
        context.pos = match.end()
    return callback
88
def set_block_scalar_indent(TokenClass):
    """Return a callback that reads a block scalar indentation indicator.

    ``context.block_scalar_indent`` is always cleared first.  For a
    non-empty match whose group 1 holds a digit, it is set to that number
    added to the current indentation level (a negative level counts as
    0).  A non-empty match is emitted as ``TokenClass`` and
    ``context.pos`` moved past it; an empty match produces nothing.
    """
    def callback(lexer, match, context):
        header = match.group()
        context.block_scalar_indent = None
        if header:
            digit = match.group(1)
            if digit:
                base = max(context.indent, 0)
                context.block_scalar_indent = base + int(digit)
            yield match.start(), TokenClass, header
            context.pos = match.end()
    return callback
105
def parse_block_scalar_empty_line(IndentTokenClass, ContentTokenClass):
    """Return a callback for a whitespace-only line in a block scalar.

    When ``context.block_scalar_indent`` is known and the match is longer
    than it, the first ``block_scalar_indent`` characters are emitted as
    ``IndentTokenClass`` and the rest as ``ContentTokenClass``.
    Otherwise a non-empty match is emitted whole as ``IndentTokenClass``.
    ``context.pos`` is moved to the end of the match in every case.
    """
    def callback(lexer, match, context):
        line = match.group()
        begin = match.start()
        limit = context.block_scalar_indent
        if limit is not None and len(line) > limit:
            # Spaces beyond the scalar's indentation belong to its value.
            yield begin, IndentTokenClass, line[:limit]
            yield begin + limit, ContentTokenClass, line[limit:]
        elif line:
            yield begin, IndentTokenClass, line
        context.pos = match.end()
    return callback
122
def parse_block_scalar_indent(TokenClass):
    """Return a callback for the leading spaces of a block scalar line.

    While ``context.block_scalar_indent`` is unknown, a line indented
    deeper than the current level (a negative level counts as 0) fixes it
    to the length of the match.  A line that is not indented deeply
    enough pops two states off ``context.stack`` and emits nothing,
    leaving ``context.pos`` untouched.  Otherwise a non-empty match is
    emitted as ``TokenClass`` and ``context.pos`` moved past it.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        width = len(spaces)
        undetermined = context.block_scalar_indent is None
        if undetermined:
            too_shallow = width <= max(context.indent, 0)
        else:
            too_shallow = width < context.block_scalar_indent
        if too_shallow:
            for _ in range(2):
                context.stack.pop()
            return
        if undetermined:
            context.block_scalar_indent = width
        if spaces:
            yield match.start(), TokenClass, spaces
            context.pos = match.end()
    return callback
142
def parse_plain_scalar_indent(TokenClass):
    """Return a callback for the leading spaces of a plain scalar line.

    A match no longer than ``context.indent`` pops two states off
    ``context.stack`` and emits nothing, leaving ``context.pos``
    untouched.  A longer, non-empty match is emitted as ``TokenClass``
    and ``context.pos`` moved past it.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        if len(spaces) > context.indent:
            if spaces:
                yield match.start(), TokenClass, spaces
                context.pos = match.end()
        else:
            context.stack.pop()
            context.stack.pop()
    return callback
155
156
class YAMLLexer(ExtendedRegexLexer):
    """Lexer for the YAML language.

    Indentation-sensitive constructs are handled by the module-level
    callback factories (``save_indent``, ``set_indent``, ...), which keep
    their state on a ``YAMLLexerContext``.
    """

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    # NOTE: the rules of a state are listed in a deliberate order; several
    # later rules are catch-alls for input the earlier ones did not claim.
    tokens = {

        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Directive),
                'yaml-directive'),
            # the %TAG directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Directive),
                'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)',
                reset_indent(Punctuation.Document), 'block-line'),
            # indentation spaces
            (r'[ ]*(?![ \t\n\r\f\v]|$)',
                save_indent(Text.Indent, start=True),
                ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Text.Break, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
                bygroups(Text.Blank, Literal.Version), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![0-9A-Za-z_-]*!)'
                r'([ ]+)(!|!?[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)',
                bygroups(Text.Blank, Name.Type, Text.Blank, Name.Type),
                'ignored-line'),
        ],

        # block collection indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Text.Blank), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text.Indent)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning of a block line
            (r'[ ]*', save_indent(Text.Indent), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text.Blank), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text.Blank),
            # tags, anchors and aliases
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`-]|[?:-][^ \t\n\r\f\v])',
                something(Literal.Scalar.Plain),
                'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors' : [
            # a full-form tag
            (r'!<[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+>', Name.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[0-9A-Za-z_-]+)?'
                r'(?:![0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)?', Name.Type),
            # an anchor
            (r'&[0-9A-Za-z_-]+', Name.Anchor),
            # an alias
            (r'\*[0-9A-Za-z_-]+', Name.Alias),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
                ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', Literal.Scalar.Flow.Quote, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', Literal.Scalar.Flow.Quote, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`])',
                something(Literal.Scalar.Plain),
                'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # the content lines of a literal or folded scalar
        'block-scalar-content': [
            # line break
            (r'\n', Text.Break),
            # empty line
            (r'^[ ]+$',
                parse_block_scalar_empty_line(Text.Indent,
                    Literal.Scalar.Block)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text.Indent)),
            # line content
            (r'[^\n\r\f\v]+', Literal.Scalar.Block),
        ],

        # the header of a literal or folded scalar: the optional
        # indentation indicator and chomping flag after '|' or '>'
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
                set_block_scalar_indent(Punctuation.Indicator),
                'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
                set_block_scalar_indent(Punctuation.Indicator),
                'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Flow),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\']+', Literal.Scalar.Flow),
            # the closing quote
            (r'\'', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', Literal.Scalar.Flow.Escape),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
                Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\"\\]+', Literal.Scalar.Flow),
            # the closing quote
            (r'"', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Punctuation.Document), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text.Indent), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text.Blank), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?![ \t\n\r\f\v])|[^ \t\n\r\f\v:])+',
                Literal.Scalar.Plain),
        ],

        # a plain scalar in the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text.Blank), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v,:?\[\]{}]+', Literal.Scalar.Plain),
        ],

    }

    def get_tokens_unprocessed(self, text=None, context=None):
        """Delegate to the base lexer with a YAML-aware context.

        When no ``context`` is supplied, a ``YAMLLexerContext`` is built
        from ``text`` and ``0`` so that the indentation attributes used by
        the rule callbacks exist.
        """
        if context is None:
            context = YAMLLexerContext(text, 0)
        return super(YAMLLexer, self).get_tokens_unprocessed(text, context)
430
431
432