1 // A lexer to print out all XML tags in a file.
2 // Uses lazy quantifiers for compact expressions.
3 // Limitations: does not check UTF-8 encoding validity, cannot handle DTDs.
4 
5   #include <stdio.h>
6   int level = 0; // current tag nesting depth: bumped by {open}, dropped by {close} and "/>"; doubles as the printf indent width
7 
// Options (per the RE/flex documentation): `dotall` lets '.' match newlines as
// well, which the lazy `.*?` patterns below rely on to span multiple lines;
// `main` generates a main() so the scanner builds as a stand-alone program.
8 %o dotall main
9 
// Named patterns used by the rules below:
//   name    - a tag/attribute name: starts with a letter, '_', ':' or any byte
//             0x80-0xFF, followed by any number of those plus '-', '.', digits
//   pi      - start of a processing instruction: "<?" followed by a name
//   comment - "<!--" up to the nearest "-->" (lazy match)
//   open    - start of an opening tag: '<' followed by a name; the attributes
//             and the closing '>' or "/>" are scanned in the ATTRIBUTES state
//   close   - a complete closing tag: "</" name '>'
//   cdata   - "<![CDATA[" up to the nearest "]]>" (lazy match)
//   string  - a double- or single-quoted string, up to the nearest matching quote
10 name                    [A-Za-z_:\x80-\xFF][-.0-9A-Za-z_:\x80-\xFF]*
11 pi                      <\?{name}
12 comment                 <!--.*?-->
13 open                    <{name}
14 close                   <\/{name}>
15 cdata                   <!\[CDATA\[.*?]]>
16 string                  \".*?\"|'.*?'
17 
// Exclusive start condition for scanning the inside of a tag: entered after
// {pi} or {open}, left again at "/>", "?>" or ">".
18 %x ATTRIBUTES
19 
20 %%
21 
22 {comment}               |
23 {cdata}                 /* skip comments and CDATA sections: the whole match is consumed and nothing is printed */
24 
25 {pi}                    start(ATTRIBUTES); /* processing instruction: not printed; switch to ATTRIBUTES to skip over its contents */
26 
27 {open}                  printf("%*s%s\n", level++, "", text() + 1); /* print the name (text() + 1 skips '<') indented by the current depth, then go one level deeper */
28                         start(ATTRIBUTES); /* the rest of the tag is scanned in the ATTRIBUTES state */
29 
30 {close}                 matcher().less(size() - 1); /* shorten the match by one char so the trailing '>' is no longer part of text() (see RE/flex less()) */
31                         printf("%*s%s\n", --level, "", text() + 2); /* go one level up first so the name (text() + 2 skips "</") lines up with its opening tag */
32 
33 <<EOF>>                 printf("Tags are %sbalanced\n", level ? "im" : ""); /* balanced only if the depth is back to 0 at end of input */
34                         return 0; /* return from the scanner at end of input */
35 
36 <ATTRIBUTES>"/>"        --level; /* tag closes itself: undo the level++ done by the {open} rule */
37                         start(INITIAL); /* back to scanning content between tags */
38 
39 <ATTRIBUTES>"?>"        |
40 <ATTRIBUTES>">"         start(INITIAL); /* end of a processing instruction or opening tag: resume normal scanning, depth unchanged */
41 
42 <ATTRIBUTES>{name}      |
43 <ATTRIBUTES>{string}    /* skip attribute names and strings; a quoted value is consumed whole, so a '>' inside it does not end the tag */
44 
45 <*>.                    /* skip anything else: in every start condition, any single character not matched by a rule above is discarded */
46 
47 %%
48