/* myhtmlparse.h: subclass of HtmlParser for extracting text
 *
 * Copyright 1999,2000,2001 BrightStation PLC
 * Copyright 2002,2003,2004,2006,2008,2010,2011,2012 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */

#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
#define OMEGA_INCLUDED_MYHTMLPARSE_H

#include "htmlparse.h"

// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
// not in all charsets and perhaps spans of all \xa0 should become a single
// \xa0?
30 #define WHITESPACE " \t\n\r" 31 32 class MyHtmlParser : public HtmlParser { 33 public: 34 bool in_script_tag; 35 bool in_style_tag; 36 bool pending_space; 37 bool indexing_allowed; 38 bool ignoring_metarobots; 39 bool charset_from_meta; 40 string title, sample, keywords, dump, author; 41 string * target; 42 43 void process_text(const string &text); 44 bool opening_tag(const string &tag); 45 bool closing_tag(const string &tag); 46 void parse_html(const string &text, const string &charset_, 47 bool charset_from_meta_); ignore_metarobots()48 void ignore_metarobots() { ignoring_metarobots = true; } MyHtmlParser()49 MyHtmlParser() : 50 in_script_tag(false), 51 in_style_tag(false), 52 pending_space(false), 53 indexing_allowed(true), 54 ignoring_metarobots(false), 55 charset_from_meta(false), 56 target(&dump) { } 57 reset()58 void reset() { 59 in_script_tag = false; 60 in_style_tag = false; 61 pending_space = false; 62 indexing_allowed = true; 63 ignoring_metarobots = false; 64 charset_from_meta = false; 65 title.resize(0); 66 sample.resize(0); 67 keywords.resize(0); 68 dump.resize(0); 69 author.resize(0); 70 target = &dump; 71 } 72 }; 73 74 #endif // OMEGA_INCLUDED_MYHTMLPARSE_H 75