1 /*=============================================================================
2     Copyright (c) 2005 2006 Joel de Guzman
3     http://spirit.sourceforge.net/
4 
5     Use, modification and distribution is subject to the Boost Software
6     License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
7     http://www.boost.org/LICENSE_1_0.txt)
8 =============================================================================*/
9 #include "post_process.hpp"
10 #include <cctype>
11 #include <set>
12 #include <stack>
13 #include <boost/bind.hpp>
14 #include <boost/spirit/include/classic_core.hpp>
15 #include <boost/spirit/include/phoenix1_operators.hpp>
16 #include <boost/spirit/include/phoenix1_primitives.hpp>
17 
18 namespace quickbook
19 {
    // Short aliases for the Boost.Spirit Classic and Phoenix namespaces
    // used by the grammar further down in this file.
    namespace cl = boost::spirit::classic;
    namespace ph = phoenix;
    // Iterator type of every character range handed to the printer and
    // to the grammar's semantic actions.
    typedef std::string::const_iterator iter_type;
23 
24     struct pretty_printer
25     {
pretty_printerquickbook::pretty_printer26         pretty_printer(std::string& out_, int& current_indent_, int linewidth_)
27             : prev(0)
28             , out(out_)
29             , current_indent(current_indent_)
30             , column(0)
31             , in_string(false)
32             , linewidth(linewidth_)
33         {
34         }
35 
indentquickbook::pretty_printer36         void indent()
37         {
38             BOOST_ASSERT(current_indent >= 0); // this should not happen!
39             for (int i = 0; i < current_indent; ++i)
40                 out += ' ';
41             column = current_indent;
42         }
43 
trim_spacesquickbook::pretty_printer44         void trim_spaces()
45         {
46             out.erase(out.find_last_not_of(' ') + 1); // trim trailing spaces
47         }
48 
break_linequickbook::pretty_printer49         void break_line()
50         {
51             trim_spaces();
52             out += '\n';
53             indent();
54         }
55 
line_is_emptyquickbook::pretty_printer56         bool line_is_empty() const
57         {
58             for (iter_type i = out.end() - (column - current_indent);
59                  i != out.end(); ++i) {
60                 if (*i != ' ') return false;
61             }
62             return true;
63         }
64 
align_indentquickbook::pretty_printer65         void align_indent()
66         {
67             // make sure we are at the proper indent position
68             if (column != current_indent) {
69                 if (column > current_indent) {
70                     if (line_is_empty()) {
71                         // trim just enough trailing spaces down to
72                         // current_indent position
73                         out.erase(
74                             out.end() - (column - current_indent), out.end());
75                         column = current_indent;
76                     }
77                     else {
78                         // nope, line is not empty. do a hard CR
79                         break_line();
80                     }
81                 }
82                 else {
83                     // will this happen? (i.e. column <= current_indent)
84                     while (column != current_indent) {
85                         out += ' ';
86                         ++column;
87                     }
88                 }
89             }
90         }
91 
printquickbook::pretty_printer92         void print(char ch)
93         {
94             // Print a char. Attempt to break the line if we are exceeding
95             // the target linewidth. The linewidth is not an absolute limit.
96             // There are many cases where a line will exceed the linewidth
97             // and there is no way to properly break the line. Preformatted
98             // code that exceeds the linewidth are examples. We cannot break
99             // preformatted code. We shall not attempt to be very strict with
100             // line breaking. What's more important is to have a reproducable
101             // output (i.e. processing two logically equivalent xml files
102             // results in two lexically equivalent xml files). *** pretty
103             // formatting is a secondary goal ***
104 
105             // Strings will occur only in tag attributes. Normal content
106             // will have &quot; instead. We shall deal only with tag
107             // attributes here.
108             if (ch == '"') in_string = !in_string; // don't break strings!
109 
110             if (!in_string && std::isspace(static_cast<unsigned char>(ch))) {
111                 // we can break spaces if they are not inside strings
112                 if (!std::isspace(static_cast<unsigned char>(prev))) {
113                     if (column >= linewidth) {
114                         break_line();
115                         if (column == 0 && ch == ' ') {
116                             ++column;
117                             out += ' ';
118                         }
119                     }
120                     else {
121                         ++column;
122                         out += ' ';
123                     }
124                 }
125             }
126             else {
127                 // we can break tag boundaries and stuff after
128                 // delimiters if they are not inside strings
129                 // and *only-if* the preceding char is a space
130                 if (!in_string && column >= linewidth &&
131                     (ch == '<' &&
132                      std::isspace(static_cast<unsigned char>(prev))))
133                     break_line();
134                 out += ch;
135                 ++column;
136             }
137 
138             prev = ch;
139         }
140 
printquickbook::pretty_printer141         void print(iter_type f, iter_type l)
142         {
143             for (iter_type i = f; i != l; ++i)
144                 print(*i);
145         }
146 
print_tagquickbook::pretty_printer147         void print_tag(iter_type f, iter_type l, bool is_flow_tag)
148         {
149             if (is_flow_tag) {
150                 print(f, l);
151             }
152             else {
153                 // This is not a flow tag, so, we're going to do a
154                 // carriage return anyway. Let us remove extra right
155                 // spaces.
156                 std::string str(f, l);
157                 BOOST_ASSERT(f != l); // this should not happen
158                 iter_type i = str.end();
159                 while (i != str.begin() &&
160                        std::isspace(static_cast<unsigned char>(*(i - 1))))
161                     --i;
162                 print(str.begin(), i);
163             }
164         }
165 
166         char prev;
167         std::string& out;
168         int& current_indent;
169         int column;
170         bool in_string;
171         int linewidth;
172 
173       private:
174         pretty_printer& operator=(pretty_printer const&);
175     };
176 
177     char const* html_block_tags_[] = {
178         "div",   "p",    "blockquote", "address", "h1",       "h2",   "h3",
179         "h4",    "h5",   "h6",         "ul",      "ol",       "li",   "dl",
180         "dt",    "dd",   "table",      "tr",      "th",       "td",   "tbody",
181         "thead", "form", "fieldset",   "hr",      "noscript", "html", "body"};
182 
183     char const* block_tags_[] = {
184         "author",      "blockquote",    "bridgehead",   "callout",
185         "calloutlist", "caution",       "copyright",    "entry",
186         "important",   "informaltable", "itemizedlist", "legalnotice",
187         "listitem",    "note",          "orderedlist",  "para",
188         "row",         "section",       "simpara",      "table",
189         "tbody",       "textobject",    "tgroup",       "thead",
190         "tip",         "variablelist",  "varlistentry", "warning",
191         "xml",         "xi:include"};
192 
193     char const* doc_types_[] = {"book",     "article",   "library", "chapter",
194                                 "part",     "appendix",  "preface", "qandadiv",
195                                 "qandaset", "reference", "set"};
196 
197     struct tidy_compiler
198     {
tidy_compilerquickbook::tidy_compiler199         tidy_compiler(std::string& out_, int linewidth_, bool is_html)
200             : out(out_)
201             , current_indent(0)
202             , printer(out_, current_indent, linewidth_)
203         {
204             if (is_html) {
205                 static std::size_t const n_block_tags =
206                     sizeof(html_block_tags_) / sizeof(char const*);
207                 for (std::size_t i = 0; i != n_block_tags; ++i) {
208                     block_tags.insert(html_block_tags_[i]);
209                 }
210             }
211             else {
212                 static std::size_t const n_block_tags =
213                     sizeof(block_tags_) / sizeof(char const*);
214                 for (std::size_t i = 0; i != n_block_tags; ++i) {
215                     block_tags.insert(block_tags_[i]);
216                 }
217 
218                 static std::size_t const n_doc_types =
219                     sizeof(doc_types_) / sizeof(char const*);
220                 for (std::size_t i = 0; i != n_doc_types; ++i) {
221                     block_tags.insert(doc_types_[i]);
222                     block_tags.insert(doc_types_[i] + std::string("info"));
223                     block_tags.insert(doc_types_[i] + std::string("purpose"));
224                 }
225             }
226         }
227 
is_flow_tagquickbook::tidy_compiler228         bool is_flow_tag(std::string const& tag)
229         {
230             return block_tags.find(tag) == block_tags.end();
231         }
232 
233         std::set<std::string> block_tags;
234         std::stack<std::string> tags;
235         std::string& out;
236         int current_indent;
237         pretty_printer printer;
238         std::string current_tag;
239 
240       private:
241         tidy_compiler& operator=(tidy_compiler const&);
242     };
243 
244     struct tidy_grammar : cl::grammar<tidy_grammar>
245     {
tidy_grammarquickbook::tidy_grammar246         tidy_grammar(tidy_compiler& state_, int indent_, bool is_html_)
247             : state(state_), indent(indent_), is_html(is_html_)
248         {
249         }
250 
251         template <typename Scanner> struct definition
252         {
definitionquickbook::tidy_grammar::definition253             definition(tidy_grammar const& self)
254             {
255                 // clang-format off
256 
257                 tag = (cl::lexeme_d[+(cl::alnum_p | '_' | ':')])  [boost::bind(&tidy_grammar::do_tag, &self, _1, _2)];
258 
259                 code =  cl::eps_p(ph::var(self.is_html))
260                     >>  "<"
261                     >>  cl::lexeme_d[cl::str_p("pre")]
262                     >>  *(cl::anychar_p - '>')
263                     >>  ">"
264                     >>  *(cl::anychar_p - "</pre>")
265                     >>  "</pre"
266                     >>  cl::lexeme_d[">" >> *cl::space_p]
267                     |   cl::eps_p(!ph::var(self.is_html))
268                     >>   "<programlisting>"
269                     >>  *(cl::anychar_p - "</programlisting>")
270                     >>  "</programlisting"
271                     >>  cl::lexeme_d[">" >> *cl::space_p]
272                     ;
273 
274                 // What's the business of cl::lexeme_d['>' >> *cl::space_p]; ?
275                 // It is there to preserve the space after the tag that is
276                 // otherwise consumed by the cl::space_p skipper.
277 
278                 escape =
279                     cl::str_p("<!--quickbook-escape-prefix-->") >>
280                     (*(cl::anychar_p - cl::str_p("<!--quickbook-escape-postfix-->")))
281                     [
282                         boost::bind(&tidy_grammar::do_escape, &self, _1, _2)
283                     ]
284                     >>  cl::lexeme_d
285                         [
286                             cl::str_p("<!--quickbook-escape-postfix-->") >>
287                             (*cl::space_p)
288                             [
289                                 boost::bind(&tidy_grammar::do_escape_post, &self, _1, _2)
290                             ]
291                         ]
292                     ;
293 
294                 start_tag = '<' >> tag >> *(cl::anychar_p - '>') >> cl::lexeme_d['>' >> *cl::space_p];
295                 start_end_tag =
296                         '<' >> tag >> *(cl::anychar_p - ("/>" | cl::ch_p('>'))) >> cl::lexeme_d["/>" >> *cl::space_p]
297                     |   "<?" >> tag >> *(cl::anychar_p - '?') >> cl::lexeme_d["?>" >> *cl::space_p]
298                     |   "<!--" >> *(cl::anychar_p - "-->") >> cl::lexeme_d["-->" >> *cl::space_p]
299                     |   "<!" >> tag >> *(cl::anychar_p - '>') >> cl::lexeme_d['>' >> *cl::space_p]
300                     ;
301                 content = cl::lexeme_d[ +(cl::anychar_p - '<') ];
302                 end_tag = "</" >> +(cl::anychar_p - '>') >> cl::lexeme_d['>' >> *cl::space_p];
303 
304                 markup =
305                         escape
306                     |   code            [boost::bind(&tidy_grammar::do_code, &self, _1, _2)]
307                     |   start_end_tag   [boost::bind(&tidy_grammar::do_start_end_tag, &self, _1, _2)]
308                     |   start_tag       [boost::bind(&tidy_grammar::do_start_tag, &self, _1, _2)]
309                     |   end_tag         [boost::bind(&tidy_grammar::do_end_tag, &self, _1, _2)]
310                     |   content         [boost::bind(&tidy_grammar::do_content, &self, _1, _2)]
311                     ;
312 
313                 tidy = *markup;
314 
315                 // clang-format on
316             }
317 
startquickbook::tidy_grammar::definition318             cl::rule<Scanner> const& start() { return tidy; }
319 
320             cl::rule<Scanner> tidy, tag, start_tag, start_end_tag, content,
321                 end_tag, markup, code, escape;
322         };
323 
do_escape_postquickbook::tidy_grammar324         void do_escape_post(iter_type f, iter_type l) const
325         {
326             for (iter_type i = f; i != l; ++i)
327                 state.out += *i;
328         }
329 
do_escapequickbook::tidy_grammar330         void do_escape(iter_type f, iter_type l) const
331         {
332             while (f != l && std::isspace(*f)) {
333                 ++f;
334             }
335             while (f != l && std::isspace(*(l - 1))) {
336                 --l;
337             }
338             for (iter_type i = f; i != l; ++i) {
339                 state.out += *i;
340             }
341         }
342 
do_codequickbook::tidy_grammar343         void do_code(iter_type f, iter_type l) const
344         {
345             state.printer.trim_spaces();
346             if (state.out[state.out.size() - 1] != '\n') state.out += '\n';
347 
348             // trim trailing space from after closing tag
349             while (f != l && std::isspace(*(l - 1))) {
350                 --l;
351             }
352 
353             // print the string taking care of line
354             // ending CR/LF platform issues
355             for (iter_type i = f; i != l;) {
356                 if (*i == '\n') {
357                     state.printer.trim_spaces();
358                     state.out += '\n';
359                     ++i;
360                     if (i != l && *i == '\r') {
361                         ++i;
362                     }
363                 }
364                 else if (*i == '\r') {
365                     state.printer.trim_spaces();
366                     state.out += '\n';
367                     ++i;
368                     if (i != l && *i == '\n') {
369                         ++i;
370                     }
371                 }
372                 else {
373                     state.out += *i;
374                     ++i;
375                 }
376             }
377             state.out += '\n';
378             state.printer.indent();
379         }
380 
do_tagquickbook::tidy_grammar381         void do_tag(iter_type f, iter_type l) const
382         {
383             state.current_tag = std::string(f, l);
384         }
385 
do_start_end_tagquickbook::tidy_grammar386         void do_start_end_tag(iter_type f, iter_type l) const
387         {
388             bool is_flow_tag = state.is_flow_tag(state.current_tag);
389             if (!is_flow_tag) state.printer.align_indent();
390             state.printer.print_tag(f, l, is_flow_tag);
391             if (!is_flow_tag) state.printer.break_line();
392         }
393 
do_start_tagquickbook::tidy_grammar394         void do_start_tag(iter_type f, iter_type l) const
395         {
396             state.tags.push(state.current_tag);
397             bool is_flow_tag = state.is_flow_tag(state.current_tag);
398             if (!is_flow_tag) state.printer.align_indent();
399             state.printer.print_tag(f, l, is_flow_tag);
400             if (!is_flow_tag) {
401                 state.current_indent += indent;
402                 state.printer.break_line();
403             }
404         }
405 
do_contentquickbook::tidy_grammar406         void do_content(iter_type f, iter_type l) const
407         {
408             state.printer.print(f, l);
409         }
410 
do_end_tagquickbook::tidy_grammar411         void do_end_tag(iter_type f, iter_type l) const
412         {
413             if (state.tags.empty())
414                 throw quickbook::post_process_failure("Mismatched tags.");
415 
416             bool is_flow_tag = state.is_flow_tag(state.tags.top());
417             if (!is_flow_tag) {
418                 state.current_indent -= indent;
419                 state.printer.align_indent();
420             }
421             state.printer.print_tag(f, l, is_flow_tag);
422             if (!is_flow_tag) state.printer.break_line();
423             state.tags.pop();
424         }
425 
426         tidy_compiler& state;
427         int indent;
428         bool is_html;
429 
430       private:
431         tidy_grammar& operator=(tidy_grammar const&);
432     };
433 
post_process(std::string const & in,int indent,int linewidth,bool is_html)434     std::string post_process(
435         std::string const& in, int indent, int linewidth, bool is_html)
436     {
437         if (indent == -1) indent = 2;        // set default to 2
438         if (linewidth == -1) linewidth = 80; // set default to 80
439 
440         std::string tidy;
441         tidy_compiler state(tidy, linewidth, is_html);
442         tidy_grammar g(state, indent, is_html);
443         cl::parse_info<iter_type> r =
444             parse(in.begin(), in.end(), g, cl::space_p);
445         if (r.full) {
446             return tidy;
447         }
448         else {
449             throw quickbook::post_process_failure("Post Processing Failed.");
450         }
451     }
452 }
453