1 /* Boost.Flyweight example of flyweight-based formatted text processing.
2  *
3  * Copyright 2006-2014 Joaquin M Lopez Munoz.
4  * Distributed under the Boost Software License, Version 1.0.
5  * (See accompanying file LICENSE_1_0.txt or copy at
6  * http://www.boost.org/LICENSE_1_0.txt)
7  *
8  * See http://www.boost.org/libs/flyweight for library home page.
9  */
10 
#include <boost/flyweight.hpp>
#include <boost/functional/hash.hpp>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
22 
23 #if defined(BOOST_NO_STDC_NAMESPACE)
24 namespace std{using ::exit;using ::tolower;}
25 #endif
26 
27 using namespace boost::flyweights;
28 
29 /* An HTML tag consists of a name and optional properties of the form
30  * name1=value1 ... namen=valuen. We do not need to parse the properties
31  * for the purposes of the program, hence they are all stored in
32  * html_tag_data::properties in raw form.
33  */
34 
/* Raw representation of an HTML tag. Deliberately kept as a plain aggregate
 * with no constructors.
 */
struct html_tag_data
{
  std::string name;       /* tag name */
  std::string properties; /* unparsed "name1=value1 ... namen=valuen" text */
};
40 
operator ==(const html_tag_data & x,const html_tag_data & y)41 bool operator==(const html_tag_data& x,const html_tag_data& y)
42 {
43   return x.name==y.name&&x.properties==y.properties;
44 }
45 
46 /* See the portability section of Boost.Hash at
47  *   http://boost.org/doc/html/hash/portability.html
48  * for an explanation of the ADL-related workarounds.
49  */
50 
51 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
52 namespace boost{
53 #endif
54 
hash_value(const html_tag_data & x)55 std::size_t hash_value(const html_tag_data& x)
56 {
57   std::size_t res=0;
58   boost::hash_combine(res,x.name);
59   boost::hash_combine(res,x.properties);
60   return res;
61 }
62 
63 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
64 } /* namespace boost */
65 #endif
66 
67 typedef flyweight<html_tag_data> html_tag;
68 
/* parse_tag is passed an iterator positioned at the first char of
 * the tag after the opening '<' and returns, if successful, a parsed tag
 * and whether it is opening (<xx>) or closing (</xx>).
 */
73 
/* Outcome of parse_tag. */
enum tag_type
{
  opening, /* <xx> */
  closing, /* </xx> */
  failure  /* no tag could be parsed */
};
75 
76 struct parse_tag_res
77 {
parse_tag_resparse_tag_res78   parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()):
79     type(type_),tag(tag_){}
parse_tag_resparse_tag_res80   parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){}
81 
82   tag_type type;
83   html_tag tag;
84 };
85 
86 template<typename ForwardIterator>
parse_tag(ForwardIterator & first,ForwardIterator last)87 parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
88 {
89   html_tag_data  tag;
90   std::string    buf;
91   bool           in_quote=false;
92   for(ForwardIterator it=first;it!=last;){
93     char ch=*it++;
94     if(ch=='>'&&!in_quote){             /* ignore '>'s if inside quotes */
95       tag_type type;
96       std::string::size_type
97         bname=buf.find_first_not_of("\t\n\r "),
98         ename=bname==std::string::npos?
99           std::string::npos:
100           buf.find_first_of("\t\n\r ",bname),
101         bprop=ename==std::string::npos?
102           std::string::npos:
103           buf.find_first_not_of("\t\n\r ",ename);
104       if(bname==ename){                 /* null name */
105         return parse_tag_res(failure);
106       }
107       else if(buf[bname]=='/'){         /* closing tag */
108         type=closing;
109         ++bname;
110       }
111       else type=opening;
112       tag.name=buf.substr(bname,ename-bname);
113       std::transform(                   /* normalize tag name to lower case */
114         tag.name.begin(),tag.name.end(),tag.name.begin(),
115         (int(*)(int))std::tolower);
116       if(bprop!=std::string::npos){
117         tag.properties=buf.substr(bprop,buf.size());
118       }
119       first=it;                         /* result good, consume the chars */
120       return parse_tag_res(type,tag);
121     }
122     else{
123       if(ch=='"')in_quote=!in_quote;
124       buf+=ch;
125     }
126   }
127   return parse_tag_res(failure);        /* end reached and found no '>' */
128 }
129 
130 /* A character context is just a vector containing the tags enclosing the
131  * character, from the outermost level to the innermost.
132  */
133 
134 typedef std::vector<html_tag>        html_context_data;
135 typedef flyweight<html_context_data> html_context;
136 
137 /* A character is a char code plus its context.
138  */
139 
140 struct character_data
141 {
character_datacharacter_data142   character_data(char code_=0,html_context context_=html_context()):
143     code(code_),context(context_){}
character_datacharacter_data144   character_data(const character_data& x):code(x.code),context(x.context){}
145 
146   char         code;
147   html_context context;
148 };
149 
operator ==(const character_data & x,const character_data & y)150 bool operator==(const character_data& x,const character_data& y)
151 {
152   return x.code==y.code&&x.context==y.context;
153 }
154 
155 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
156 namespace boost{
157 #endif
158 
hash_value(const character_data & x)159 std::size_t hash_value(const character_data& x)
160 {
161   std::size_t res=0;
162   boost::hash_combine(res,x.code);
163   boost::hash_combine(res,x.context);
164   return res;
165 }
166 
167 #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
168 } /* namespace boost */
169 #endif
170 
171 typedef flyweight<character_data> character;
172 
173 /* scan_html converts HTML code into a stream of contextualized characters.
174  */
175 
176 template<typename ForwardIterator,typename OutputIterator>
scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)177 void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
178 {
179   html_context_data context;
180   while(first!=last){
181     if(*first=='<'){                                 /* tag found */
182       ++first;
183       parse_tag_res res=parse_tag(first,last);
184       if(res.type==opening){                         /* add to contex */
185         context.push_back(res.tag);
186         continue;
187       }
188       else if(res.type==closing){                    /* remove from context */
189         /* Pop all tags from the innermost to the matching one; this takes
190          * care of missing </xx>s like vg. in <ul><li>hello</ul>.
191          */
192 
193         for(html_context_data::reverse_iterator rit=context.rbegin();
194             rit!=context.rend();++rit){
195           if(rit->get().name==res.tag.get().name){
196             context.erase(rit.base()-1,context.end());
197             break;
198           }
199         }
200         continue;
201       }
202     }
203     *out++=character(*first++,html_context(context));
204   }
205 }
206 
207 /* HTML-producing utilities */
208 
print_opening_tag(std::ostream & os,const html_tag_data & x)209 void print_opening_tag(std::ostream& os,const html_tag_data& x)
210 {
211   os<<"<"<<x.name;
212   if(!x.properties.empty())os<<" "<<x.properties;
213   os<<">";
214 }
215 
print_closing_tag(std::ostream & os,const html_tag_data & x)216 void print_closing_tag(std::ostream& os,const html_tag_data& x)
217 {
218   /* SGML declarations (beginning with '!') are not closed */
219 
220   if(x.name[0]!='!')os<<"</"<<x.name<<">";
221 }
222 
223 /* change_context takes contexts from and to with tags
224  *
225  *   from<- c1 ... cn fn+1 ... fm
226  *   to  <- c1 ... cn tn+1 ... tk
227  *
228  * (that is, they share the first n tags, n might be 0), and
229  * produces code closing fm ... fn+1 and opening tn+1 ... tk.
230  */
231 
232 template<typename OutputIterator>
change_context(const html_context_data & from,const html_context_data & to,OutputIterator out)233 void change_context(
234   const html_context_data& from,const html_context_data& to,
235   OutputIterator out)
236 {
237   std::ostringstream oss;
238   html_context_data::const_iterator
239     it0=from.begin(),
240     it0_end=from.end(),
241     it1=to.begin(),
242     it1_end=to.end();
243   for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
244   while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
245   while(it1!=it1_end)print_opening_tag(oss,*it1++);
246   std::string str=oss.str();
247   std::copy(str.begin(),str.end(),out);
248 }
249 
250 /* produce_html is passed a bunch of contextualized characters and emits
251  * the corresponding HTML. The algorithm is simple: tags are opened and closed
252  * as a result of the context from one character to the following changing.
253  */
254 
255 template<typename ForwardIterator,typename OutputIterator>
produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)256 void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
257 {
258   html_context context;
259   while(first!=last){
260     if(first->get().context!=context){
261       change_context(context,first->get().context,out);
262       context=first->get().context;
263     }
264     *out++=(first++)->get().code;
265   }
266   change_context(context,html_context(),out); /* close remaining context */
267 }
268 
269 /* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
270  * find some friend operators in certain contexts.
271  */
272 
273 character dummy1;
274 html_tag  dummy2;
275 
main()276 int main()
277 {
278   std::cout<<"input html file: ";
279   std::string in;
280   std::getline(std::cin,in);
281   std::ifstream ifs(in.c_str());
282   if(!ifs){
283     std::cout<<"can't open "<<in<<std::endl;
284     std::exit(EXIT_FAILURE);
285   }
286   typedef std::istreambuf_iterator<char> istrbuf_iterator;
287   std::vector<char> html_source;
288   std::copy(
289     istrbuf_iterator(ifs),istrbuf_iterator(),
290     std::back_inserter(html_source));
291 
292   /* parse the HTML */
293 
294   std::vector<character> scanned_html;
295   scan_html(
296     html_source.begin(),html_source.end(),std::back_inserter(scanned_html));
297 
298   /* Now that we have the text as a vector of contextualized characters,
299    * we can shuffle it around and manipulate in almost any way we please.
300    * For instance, the following reverses the central portion of the doc.
301    */
302 
303   std::reverse(
304     scanned_html.begin()+scanned_html.size()/4,
305     scanned_html.begin()+3*(scanned_html.size()/4));
306 
307   /* emit the resulting HTML */
308 
309   std::cout<<"output html file: ";
310   std::string out;
311   std::getline(std::cin,out);
312   std::ofstream ofs(out.c_str());
313   if(!ofs){
314     std::cout<<"can't open "<<out<<std::endl;
315     std::exit(EXIT_FAILURE);
316   }
317   typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
318   produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));
319 
320   return 0;
321 }
322