1 /* myhtmlparse.h: subclass of HtmlParser for extracting text
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2002,2003,2004,2006,2008,2010,2011,2012 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
19  * USA
20  */
21 
22 #ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
23 #define OMEGA_INCLUDED_MYHTMLPARSE_H
24 
25 #include "htmlparse.h"
26 
// The characters treated as whitespace: space, tab, newline and carriage
// return.
//
// FIXME: Should \xa0 be included as well?  It is the non-breaking space in
// iso-8859-1, but not in all charsets.  Perhaps a span made up entirely of
// \xa0 should instead become a single \xa0?
#define WHITESPACE " \t\n\r"
31 
32 class MyHtmlParser : public HtmlParser {
33     public:
34 	bool in_script_tag;
35 	bool in_style_tag;
36 	bool pending_space;
37 	bool indexing_allowed;
38 	bool ignoring_metarobots;
39 	bool charset_from_meta;
40 	string title, sample, keywords, dump, author;
41 	string * target;
42 
43 	void process_text(const string &text);
44 	bool opening_tag(const string &tag);
45 	bool closing_tag(const string &tag);
46 	void parse_html(const string &text, const string &charset_,
47 			bool charset_from_meta_);
ignore_metarobots()48 	void ignore_metarobots() { ignoring_metarobots = true; }
MyHtmlParser()49 	MyHtmlParser() :
50 		in_script_tag(false),
51 		in_style_tag(false),
52 		pending_space(false),
53 		indexing_allowed(true),
54 		ignoring_metarobots(false),
55 		charset_from_meta(false),
56 		target(&dump) { }
57 
reset()58 	void reset() {
59 	    in_script_tag = false;
60 	    in_style_tag = false;
61 	    pending_space = false;
62 	    indexing_allowed = true;
63 	    ignoring_metarobots = false;
64 	    charset_from_meta = false;
65 	    title.resize(0);
66 	    sample.resize(0);
67 	    keywords.resize(0);
68 	    dump.resize(0);
69 	    author.resize(0);
70 	    target = &dump;
71 	}
72 };
73 
74 #endif // OMEGA_INCLUDED_MYHTMLPARSE_H
75