1 // This file is part of The New Aspell
2 // Copyright (C) 2004 by Tom Snyder
3 // Copyright (C) 2001-2004 by Kevin Atkinson under the GNU LGPL license
4 // version 2.0 or 2.1.  You should have received a copy of the LGPL
5 // license along with this library if you did not you can find
6 // it at http://www.gnu.org/.
7 //
8 // The original filter was written by Kevin Atkinson.
9 // Tom Snyder rewrote the filter to support skipping SGML tags
10 //
11 // This filter enables the spell checking of sgml, html, and xhtml files
12 // by skipping the <elements> and such.
13 // The overall strategy is based on http://www.w3.org/Library/SGML.c.
14 // We don't use that code (nor the sourceforge 'expat' project code) for
15 // simplicity's sake.  We don't need to fully parse all aspects of HTML -
16 // we just need to skip and handle a few aspects. The w3.org code had too
17 // many linkages into their overall SGML/HTML processing engine.
18 //
19 // See the comment at the end of this file for examples of what we handle.
20 // See the config setting docs regarding our config lists: check and skip.
22 #include <stdio.h> // needed for sprintf
24 #include "settings.h"
26 #include "asc_ctype.hpp"
27 #include "config.hpp"
28 #include "indiv_filter.hpp"
29 #include "string_map.hpp"
30 #include "mutable_container.hpp"
31 #include "clone_ptr-t.hpp"
32 #include "filter_char_vector.hpp"
34 //right now unused option
35 //  static const KeyInfo sgml_options[] = {
36 //    {"sgml-extension", KeyInfoList, "html,htm,php,sgml",
37 //     N_("sgml file extensions")}
38 //  };
40 namespace {
42   using namespace acommon;
44   class ToLowerMap : public StringMap
45   {
46   public:
add(ParmStr to_add)47     PosibErr<bool> add(ParmStr to_add) {
48       String new_key;
49       for (const char * i = to_add; *i; ++i) new_key += asc_tolower(*i);
50       return StringMap::add(new_key);
51     }
remove(ParmStr to_rem)53     PosibErr<bool> remove(ParmStr to_rem) {
54       String new_key;
55       for (const char * i = to_rem; *i; ++i) new_key += asc_tolower(*i);
56       return StringMap::remove(new_key);
57     }
58   };
60   class SgmlFilter : public IndividualFilter
61   {
62     // State enum. These states track where we are in the HTML/tag/element constructs.
63     // This diagram shows the main states. The marked number is the state we enter
64     // *after* we've read that char. Note that some of the state transitions handle
65     // illegal HTML such as <tag=what?>.
66     //
67     //  real text <tag attrib = this  attrib2='that'> &nbsp; </tag> &#123;
68     //   |        |   |  |   ||  |             |      |  |    |      |
69     //   1        2   3  4   56  7             8      10 11   9      12
70     enum ScanState {
71       S_text,   // 1. raw user text outside of any markup.
72       S_tag,    // 2. reading the 'tag' in <tag>
73       S_tag_gap,// 3. gap between attributes within an element:
74       S_attr,   // 4. Looking at an attrib name
75       S_attr_gap,// 5. optional gap after attrib name
76       S_equals,  // 6. Attrib equals sign, also space after the equals.
77       S_value,   // 7. In attrib value.
78       S_quoted,  // 8. In quoted attrib value.
79       S_end,     // 9. Same as S_tag, but this is a </zee> type end tag.
80       S_ignore_junk, // special case invalid area to ignore.
81       S_ero,     // 10. in the &code; special encoding within HTML.
82       S_entity,  // 11. in the alpha named &nom; special encoding..
83       S_cro,     // 12. after the # of a &#nnn; numerical char reference encoding.
85     // SGML.. etc can have these special "declarations" within them. We skip them
86     //  in a more raw manners since they don't abide by the attrib= rules.
87     // Most importantly, some of the attrib quoting rules don't apply.
88     //  <!ENTITY rtfchar "gg" - - (high, low)>  <!--  fully commented -->
89     //   |               |                        ||                   |
90     //   20              21                     23  24                 25
91     S_md,     // 20. In a declaration (or beginning a comment).
92     S_mdq,    // 21. Declaration in quotes - double or single quotes.
93     S_com_1,  // 23. perhaps a comment beginning.
94     S_com,    // 24. Fully in a comment
95     S_com_e,  // 25. Perhaps ending a comment.
97     //S_literal, // within a tag pair that means all content should be interpreted literally: <PRE>
100     //S_esc,S_dollar,S_paren,S_nonasciitext // WOULD BE USED FOR ISO_2022_JP support.
101                                           // NOT CURRENTLY SUPPORTED.
102     };
104     ScanState in_what;
105 	     // which quote char is quoting this attrib value.
107     FilterChar::Chr  quote_val;
108 	    // one char prior to this one. For escape handling and such.
109     FilterChar::Chr  lookbehind;
111     String tag_name;    // we accumulate the current tag name here.
112     String attrib_name; // we accumulate the current attribute name here.
114     bool include_attrib;  // are we in one of the attribs that *should* be spell checked (alt=..)
115     int  skipall;         // are we in one of the special skip-all content tags? This is treated
116 			  // as a bool and as a nesting level count.
117     String tag_endskip;  // tag name that will end that.
119     StringMap check_attribs; // list of attribs that we *should* spell check.
120     StringMap skip_tags;   // list of tags that start a no-check-at-all zone.
122     String which;
124     bool process_char(FilterChar::Chr c);
126   public:
SgmlFilter(const char * n)127 
128     SgmlFilter(const char * n) : which(n) {}
130     PosibErr<bool> setup(Config *);
131     void reset();
132     void process(FilterChar * &, FilterChar * &);
133   };
setup(Config * opts)134 
135   PosibErr<bool> SgmlFilter::setup(Config * opts)
136   {
137     name_ = which + "-filter";
138     order_num_ = 0.35;
139     check_attribs.clear();
140     skip_tags.clear();
141     opts->retrieve_list("f-" + which + "-skip",  &skip_tags);
142     opts->retrieve_list("f-" + which + "-check", &check_attribs);
143     reset();
144     return true;
145   }
147   void SgmlFilter::reset()
148   {
149     in_what = S_text;
150     quote_val = lookbehind = '\0';
151     skipall = 0;
152     include_attrib = false;
153   }
155   // yes this should be inlines, it is only called once
157   // RETURNS: TRUE if the caller should skip the passed char and
158   //  not do any spell check on it. FALSE if char is a part of the text
process_char(FilterChar::Chr c)159   //  of the document.
160   bool SgmlFilter::process_char(FilterChar::Chr c) {
162     bool retval = true;  // DEFAULT RETURN VALUE. All returns are done
163     			 // via retval and falling out the bottom. Except for
164     			 // one case that must manage the lookbehind char.
166     // PS: this switch will be fast since S_ABCs are an enum and
167     //  any good compiler will build a jump table for it.
168     // RE the gotos: Sometimes considered bad practice but that is
169     //  how the W3C code (1995) handles it. Could be done also with recursion
170     //  but I doubt that will clarify it. The gotos are done in cases where several
171     //  state changes occur on a single input char.
173     switch( in_what ) {
175       case S_text:   // 1. raw user text outside of any markup.
176 	   s_text:
177         switch( c ) {
178           case '&': in_what = S_ero;
179 		    break;
180           case '<': in_what = S_tag; tag_name.clear();
181 		    break;
182           default:
183                 retval = skipall;  // ********** RETVAL ASSIGNED
184         }			    // **************************
185         break;
187       case S_tag:    // 2. reading the 'tag' in <tag>
188       		     //  heads up: <foo/bar will be treated as an end tag. That's what w3c does.
189         switch( c ) {
190 	  case '>': goto all_end_tags;
191           case '/': in_what = S_end;
192 		    tag_name.clear();
193 		    break;
194           case '!': in_what = S_md;
195 		    break;
196           default: // either more alphanum of the tag, or end of tagname:
197             if( asc_isalpha(c) || asc_isdigit(c) ) {
198                 tag_name += asc_tolower(c);
199             }
200             else {  // End of the tag:
201                 in_what = S_tag_gap;
202 		goto s_tag_gap;  // handle content in that zone.
203 	    }
204 	}
205 	break;
207 	// '>'  '>'  '>'  '>'
208       all_end_tags:   // this gets called by several states to handle the
209 		       // possibility of a '>' ending a whole <tag...> guy.
210 	if( c != '>' ) break;
211 	in_what = S_text;
213 	if( lookbehind == '/' ) {
214 	    // Wowza: this is how we handle the <script stuff /> XML style self
215 	    //  terminating tag. By clearing the tagname out tag-skip-all code
216 	    //  will not be invoked.
217 	    tag_name.clear();
218 	}
220 	// Does this tag cause us to skip all content?
221 	if( skipall ) {
222 	    // already in a skip-all context. See if this is
223 	    // the same skipall tag:
224 	    if( !strcmp( tag_name.c_str(), tag_endskip.c_str() ) ) {
225 		++skipall;  // increment our nesting level count.
226 	    }
227 	}
228 	else {  // Should we begin a skip all range?
229 	    skipall = (skip_tags.have( tag_name.c_str() ) ? 1 : 0);
230 	    if( skipall ) {
231 		tag_endskip = tag_name;  // remember what tag to end on.
232 	    }
233 	}
234 	break;
236       case S_tag_gap: // 3. gap between attributes within an element:
237 	   s_tag_gap:
238       	switch( c ) {
239       	  case '>': goto all_end_tags;
241       	  case '=': in_what = S_attr_gap;
242 		    break; // uncommon - no-name attrib value
243       	  default:
244       	    if( asc_isspace( c ) ) break; // still in gap.
245       	    else {
246 		in_what = S_attr;   // start of attribute name;
247 		attrib_name.clear();
248       	 	attrib_name += asc_tolower( c );
249       	    }
250       	    break;
251       	 }
252       	 break;
254       case S_end:     // 9. Same as S_tag, but this is a </zee> type end tag.
255       	if( asc_isalpha(c) || asc_isdigit(c) ) {
256       	  tag_name += asc_tolower( c );
257       	}
258       	else {
259 	  // See if we have left a skipall tag range.
260 	  if( skipall && !strcmp( tag_name.c_str(), tag_endskip.c_str() ) ) {
261 	    --skipall; // lessen nesting level count. This usually takes us to zero.
262 	  }
263 	  if( c == '>' ) in_what = S_text;  // --don't go to all_end_tags.  Really.
264 	  else in_what = S_ignore_junk;  // no-mans land: </end whats this??>
265       	}
266       	break;
268       case S_ignore_junk:  // no-mans land state: </end whats this here??>
269       	if( c == '>' ) in_what = S_text;
270       	break;
272       case S_attr:   // 4. Looking at an attrib name
273       	if( asc_isspace(c) ) in_what = S_attr_gap;
274       	else if( c == '=' )  in_what = S_equals;
275 	else if( c == '>' )  goto all_end_tags;
276 	else {
277 	  attrib_name += asc_tolower( c );
278 	}
279 	break;
281       case S_attr_gap: // 5. optional gap after attrib name
282       	if( asc_isspace(c) ) break;
283       	else if( c == '=' )  in_what = S_equals;
284 	else if( c == '>' )  goto all_end_tags;
285 	else { // beginning of a brand new attr
286 	  attrib_name.clear();
287 	  attrib_name += asc_tolower( c );
288 	}
289 	break;
291       case S_equals:  // 6. Attrib equals sign, also space after the equals.
292       	if( asc_isspace(c) ) break;
293       	switch( c ) {
294       	  case '>':  goto all_end_tags;
296       	  case '\'':
297       	  case '"':  in_what = S_quoted;
298       	  	     quote_val = c;
299 		     break;
300       	  default:   in_what = S_value;
301 		     break;
302       	}
303       	// See if this attrib deserves full checking:
304       	include_attrib=check_attribs.have( attrib_name.c_str() );
305 	// Handle the first value char if that is where we are now:
306 	if( in_what == S_value ) goto s_value;
307 	break;
309       case S_value:   // 7. In attrib value.
310 	   s_value:
311       	if( c == '>' ) goto all_end_tags;
312       	else if( asc_isspace(c) ) in_what = S_tag_gap; // end of attrib value
313       				// *****************************
314       				// ********** RETVAL ASSIGNED
315       	else if( include_attrib ) retval = false; // spell check this value.
316       	break;
318       case S_quoted: // 8. In quoted attrib value.
319       	if( c == quote_val && lookbehind != '\\' ) in_what = S_tag_gap;
320       	else if( c == '\\' && lookbehind == '\\' ) {
321       		// This is an escape of an backslash. Therefore the backslash
322       		// does not escape what follows. Therefore we don't leave it in
323       		// the lookbehind. Yikes!
324       	  lookbehind = '\0';
325       	  return !include_attrib;      // ************* RETURN RETURN RETURN RETURN
326       	}
327       	else retval = !include_attrib;
328 	break;
330       // note: these three cases - S_ero, S_cro, and S_entity which all handle
331       //  the &stuff; constructs are broken into 3 states for future upgrades. Someday
332       //  you may want to handle the chars these guys represent as individual chars.
333       //  I don't have the desire nor the knowledge to do it now.  -Tom, 5/5/04.
334       case S_ero:     // 10. in the &code; special encoding within HTML.
335       		// &# is a 'Char Ref Open'
336       	if( c == '#' ) {
337 	  in_what = S_cro;
338 	  break;
339 	}
342       case S_cro:     // 12. after the # of a &#nnn; numerical char reference encoding.
343       case S_entity:  // 11. in the alpha named &nom; special encoding..
344 	if( asc_isalpha(c) || asc_isdigit(c) ) break; // more entity chars.
345 	in_what = S_text;
346       	if( c == ';' ) break;  // end of char code.
347 	goto s_text; // ran right into text. Handle it.
350       // SGML.. etc can have these special "declarations" within them. We skip them
351       //  in a more raw manners since they don't abide by the attrib= rules.
352       // Most importantly, some of the quoting rules don't apply.
353       //  <!ENTITY rtfchar "gg" 'tt' - - (high, low)>  <!--  fully commented -->
354       //   |               |    |                        ||                  ||
355       //   20              21   22                     23  24              25  26
356       case S_md:     // 20. In a declaration (or comment).
357       	switch( c ) {
358       	  case '-': if( lookbehind == '!' ) {
359 			in_what = S_com_1;
360 		    }
361 		    break;
363       	  case '"':     // fallthrough - yes.
364       	  case '\'': in_what = S_mdq;
365 		     quote_val=c;
366 		     break;
367       	  case '>':  in_what = S_text; // note: NOT all_end_tags cause it's not a real tag.
368 		     break;
369       	}
370       	break;
372     case S_mdq: // 22. Declaration in quotes.
373     	if( c == quote_val ) in_what = S_md;
374     	else if( c == '>' )  in_what = S_text;
375     	break;
377     case S_com_1:  // 23. perhaps a comment beginning.
378     	if( c == '-' ) in_what = S_com;
379     	else if( c == '>' ) in_what = S_text;
380     	else in_what = S_md; // out of possible comment.
381     	break;
383     case S_com:    // 24. Fully in a comment
384     	if( c == '-' && lookbehind == '-' ) in_what = S_com_e;
385     	break;
387     case S_com_e:  // 25. Perhaps ending a comment.
388     	if( c == '>' ) in_what = S_text;
389     	else if( c != '-' ) in_what = S_com;  // back to basic comment.
390     	break;
391     }
393     // update the lookbehind:
394     lookbehind = c;
396     return( retval );
397   }
process(FilterChar * & str,FilterChar * & stop)398 
399   void SgmlFilter::process(FilterChar * & str, FilterChar * & stop)
400   {
401     FilterChar * cur = str;
402     while (cur != stop) {
403       if (process_char(*cur))
404 	*cur = ' ';
405       ++cur;
406     }
407   }
409   //
410   //
411   //
413   class SgmlDecoder : public IndividualFilter
414   {
415     FilterCharVector buf;
416     String which;
SgmlDecoder(const char * n)417   public:
418     SgmlDecoder(const char * n) : which(n) {}
reset()419     PosibErr<bool> setup(Config *);
420     void reset() {}
421     void process(FilterChar * &, FilterChar * &);
422   };
setup(Config *)423 
424   PosibErr<bool> SgmlDecoder::setup(Config *)
425   {
426     name_ = which + "-decoder";
427     order_num_ = 0.65;
428     return true;
429   }
process(FilterChar * & start,FilterChar * & stop)430 
431   void SgmlDecoder::process(FilterChar * & start, FilterChar * & stop)
432   {
433     buf.clear();
434     FilterChar * i = start;
435     while (i != stop)
436     {
437       if (*i == '&') {
438 	FilterChar * i0 = i;
439 	FilterChar::Chr chr;
440 	++i;
441 	if (i != stop && *i == '#') {
442 	  chr = 0;
443 	  ++i;
444 	  while (i != stop && asc_isdigit(*i)) {
445 	    chr *= 10;
446 	    chr += *i - '0';
447 	    ++i;
448 	  }
449 	} else {
450 	  while (i != stop && (asc_isalpha(*i) || asc_isdigit(*i))) {
451 	    ++i;
452 	  }
453 	  chr = '?';
454 	}
455 	if (i != stop && *i == ';')
456 	  ++i;
457 	buf.append(FilterChar(chr, i0, i));
458       } else {
459 	buf.append(*i);
460 	++i;
461       }
462     }
463     buf.append('\0');
464     start = buf.pbegin();
465     stop  = buf.pend() - 1;
466   }
468   //
469   // Sgml Encoder - BROKEN do not use
470   //
472 //   class SgmlEncoder : public IndividualFilter
473 //   {
474 //     FilterCharVector buf;
475 //     String which;
476 //   public:
477 //     SgmlEncoder(const char * n) : which(n) {}
478 //     PosibErr<bool> setup(Config *);
479 //     void reset() {}
480 //     void process(FilterChar * &, FilterChar * &);
481 //   };
483 //   PosibErr<bool> SgmlEncoder::setup(Config *)
484 //   {
485 //     name_ = which + "-encoder";
486 //     order_num_ = 0.99;
487 //     return true;
488 //   }
490 //   void SgmlEncoder::process(FilterChar * & start, FilterChar * & stop)
491 //   {
492 //     buf.clear();
493 //     FilterChar * i = start;
494 //     while (i != stop)
495 //     {
496 //       if (*i > 127) {
497 // 	buf.append("&#", i->width);
498 // 	char b[10];
499 // 	sprintf(b, "%d", i->chr);
500 // 	buf.append(b, 0);
501 // 	buf.append(';', 0);
502 //       } else {
503 // 	buf.append(*i);
504 //       }
505 //       ++i;
506 //     }
507 //     buf.append('\0');
508 //     start = buf.pbegin();
509 //     stop  = buf.pend() - 1;
510 //   }
511 }
513 C_EXPORT IndividualFilter * new_aspell_sgml_filter()
514 {
515   return new SgmlFilter("sgml");
516 }
517 C_EXPORT IndividualFilter * new_aspell_sgml_decoder()
518 {
519   return new SgmlDecoder("sgml");
520 }
521 // C_EXPORT IndividualFilter * new_aspell_sgml_encoder()
522 // {
523 //   return new SgmlEncoder("sgml");
524 // }
526 C_EXPORT IndividualFilter * new_aspell_html_filter()
527 {
528   return new SgmlFilter("html");
529 }
530 C_EXPORT IndividualFilter * new_aspell_html_decoder()
531 {
532   return new SgmlDecoder("html");
533 }
534 // C_EXPORT IndividualFilter * new_aspell_html_encoder()
535 // {
536 //   return new SgmlEncoder("html");
537 // }
540 /* Example HTML:
542 <!--
543 This file contains several constructs that test the parsing and
544 handling of SGML/HTML/XML in sgml.cpp.
546 The only spelling errors you should see will be the word 'report this NNNN'.
547 There will be 22 of these.
549 run this by executing:
550 aspell pipe -H < sgmltest.html
552 WARNING: this is not really valid HTML. Don't display in a browser!
553 -->
555 <!-- phase 1 - SGML comments. -->
556 reportthiszphaseONE
557  <!-- ** 1.0 Valid comments... This file is full of them.  -->
558  <!-- ** 1.1 invalid open comment: -->
559 <!- not in a comment>reportthisyes</!->
561  <!-- ** 1.2 invalid close comment: -->
562 <!-- -- > spallwhat DON'T REPORT -> spallwhat DON'T REPORT -->
564 <!-- phase 1.5 - special entity encodings -->
565 reportthisphaseONEFIVE
566  &nbsp; don't&nbsp;report&nbsp;this
567  &#011; do not&#x20;report this.
568  do not&gt;report this.
569  this &amp; that.
571 <!-- phase 2 - special skip tags -->
572 reportthisphaseTWO
573 <SCRIPT> spallwhat DON'T REPORT </SCRIPT> reportthisyes
574 <style> spallwhat DON'T REPORT </style> reportthisyes
575 <STYLE something="yes yes"
576       > spallwhat DON'T REPORT </style > reportthisyes
577 <script/> reportthisyes  <!-- XHTML style terminated tag -->
578 <script someattrib=value/> reportthisyes  <!-- XHTML style terminated tag -->
579 <!-- Nested skip tags -->
580 <script> spallwhatnoreport <script> nonoreport </script><b>hello</b> nonoreport</script>reportthisyes
582 <!-- phase 3 - special 'include this' attributes -->
583 reportthisphaseTHREE
584 <tagname alt="image text reportthisyes" alt2=spallwhat altt="spallwhat don't report">
585 <tagname ALT="image text reportthisyes" ALT2=spallwhat AL="spallwhat don't report">
587 <!-- phase 4 - attribute value quoting and escaping -->
588 reportthisphaseoneFOUR
589 <checkthis attribute111=simple/value.value >
590 <checkagain SOMEattrib   =   "whoa boy, mimimimspelled  ">
591 <singlequotes gotcha=   'singlypingly quoted///'>
592 <dblescaped gogogogo="dontcheck \">still in dontcheck\\\" still in dontcheck"> reportthisyes.
593 <dBLmore TomTomTomTom="so many escapes: \\\\\\\\"> reportthisyes.
594 <dblescaped gogogogo='dontcheck \'>still in dontcheck\\\' still in dontcheck'> reportthisyes.
595 <dBLmore TomTomTomTom='so many escapes: \\\\\\\\'> reportthisyes.
596 <mixnmatch scanhere='">dontcheck \"dontcheck \'dontcheck' alt=reportthisyes>
598 <!-- phase 5 - questionable (though all too common) constructs -->
599 reportthisphaseFIVE
600 <tag=dontreport> reportthisyes <tag hahahahhaha>reportthisyes
601 <!-- this one is from Yahoo! -->
602 <td width=1%><img src="http://wellll/thereeee/nowwww" alt="cool stuff">
603 <td width=1%><img src=http://wellll/thereeee/nowwww alt=real cool stuff>
605 */