1 // This file is part of The New Aspell
2 // Copyright (C) 2004 by Tom Snyder
3 // Copyright (C) 2001-2004 by Kevin Atkinson under the GNU LGPL license
4 // version 2.0 or 2.1.  You should have received a copy of the LGPL
5 // license along with this library if you did not you can find
6 // it at http://www.gnu.org/.
7 //
8 // The original filter was written by Kevin Atkinson.
9 // Tom Snyder rewrote the filter to support skipping SGML tags
10 //
11 // This filter enables the spell checking of sgml, html, and xhtml files
12 // by skipping the <elements> and such.
13 // The overall strategy is based on http://www.w3.org/Library/SGML.c.
14 // We don't use that code (nor the sourceforge 'expat' project code) for
15 // simplicity's sake.  We don't need to fully parse all aspects of HTML -
16 // we just need to skip and handle a few aspects. The w3.org code had too
17 // many linkages into their overall SGML/HTML processing engine.
18 //
19 // See the comment at the end of this file for examples of what we handle.
20 // See the config setting docs regarding our config lists: check and skip.
21 
22 #include <stdio.h> // needed for sprintf
23 
24 #include "settings.h"
25 
26 #include "asc_ctype.hpp"
27 #include "config.hpp"
28 #include "indiv_filter.hpp"
29 #include "string_map.hpp"
30 #include "mutable_container.hpp"
31 #include "clone_ptr-t.hpp"
32 #include "filter_char_vector.hpp"
33 
34 //right now unused option
35 //  static const KeyInfo sgml_options[] = {
36 //    {"sgml-extension", KeyInfoList, "html,htm,php,sgml",
37 //     N_("sgml file extensions")}
38 //  };
39 
40 namespace {
41 
42   using namespace acommon;
43 
44   class ToLowerMap : public StringMap
45   {
46   public:
add(ParmStr to_add)47     PosibErr<bool> add(ParmStr to_add) {
48       String new_key;
49       for (const char * i = to_add; *i; ++i) new_key += asc_tolower(*i);
50       return StringMap::add(new_key);
51     }
52 
remove(ParmStr to_rem)53     PosibErr<bool> remove(ParmStr to_rem) {
54       String new_key;
55       for (const char * i = to_rem; *i; ++i) new_key += asc_tolower(*i);
56       return StringMap::remove(new_key);
57     }
58   };
59 
60   class SgmlFilter : public IndividualFilter
61   {
62     // State enum. These states track where we are in the HTML/tag/element constructs.
63     // This diagram shows the main states. The marked number is the state we enter
64     // *after* we've read that char. Note that some of the state transitions handle
65     // illegal HTML such as <tag=what?>.
66     //
67     //  real text <tag attrib = this  attrib2='that'> &nbsp; </tag> &#123;
68     //   |        |   |  |   ||  |             |      |  |    |      |
69     //   1        2   3  4   56  7             8      10 11   9      12
70     enum ScanState {
71       S_text,   // 1. raw user text outside of any markup.
72       S_tag,    // 2. reading the 'tag' in <tag>
73       S_tag_gap,// 3. gap between attributes within an element:
74       S_attr,   // 4. Looking at an attrib name
75       S_attr_gap,// 5. optional gap after attrib name
76       S_equals,  // 6. Attrib equals sign, also space after the equals.
77       S_value,   // 7. In attrib value.
78       S_quoted,  // 8. In quoted attrib value.
79       S_end,     // 9. Same as S_tag, but this is a </zee> type end tag.
80       S_ignore_junk, // special case invalid area to ignore.
81       S_ero,     // 10. in the &code; special encoding within HTML.
82       S_entity,  // 11. in the alpha named &nom; special encoding..
83       S_cro,     // 12. after the # of a &#nnn; numerical char reference encoding.
84 
85     // SGML.. etc can have these special "declarations" within them. We skip them
86     //  in a more raw manners since they don't abide by the attrib= rules.
87     // Most importantly, some of the attrib quoting rules don't apply.
88     //  <!ENTITY rtfchar "gg" - - (high, low)>  <!--  fully commented -->
89     //   |               |                        ||                   |
90     //   20              21                     23  24                 25
91     S_md,     // 20. In a declaration (or beginning a comment).
92     S_mdq,    // 21. Declaration in quotes - double or single quotes.
93     S_com_1,  // 23. perhaps a comment beginning.
94     S_com,    // 24. Fully in a comment
95     S_com_e,  // 25. Perhaps ending a comment.
96 
97     //S_literal, // within a tag pair that means all content should be interpreted literally: <PRE>
98                // NOT CURRENTLY SUPPORTED FULLY.
99 
100     //S_esc,S_dollar,S_paren,S_nonasciitext // WOULD BE USED FOR ISO_2022_JP support.
101                                           // NOT CURRENTLY SUPPORTED.
102     };
103 
104     ScanState in_what;
105 	     // which quote char is quoting this attrib value.
106 
107     FilterChar::Chr  quote_val;
108 	    // one char prior to this one. For escape handling and such.
109     FilterChar::Chr  lookbehind;
110 
111     String tag_name;    // we accumulate the current tag name here.
112     String attrib_name; // we accumulate the current attribute name here.
113 
114     bool include_attrib;  // are we in one of the attribs that *should* be spell checked (alt=..)
115     int  skipall;         // are we in one of the special skip-all content tags? This is treated
116 			  // as a bool and as a nesting level count.
117     String tag_endskip;  // tag name that will end that.
118 
119     StringMap check_attribs; // list of attribs that we *should* spell check.
120     StringMap skip_tags;   // list of tags that start a no-check-at-all zone.
121 
122     String which;
123 
124     bool process_char(FilterChar::Chr c);
125 
126   public:
SgmlFilter(const char * n)127 
128     SgmlFilter(const char * n) : which(n) {}
129 
130     PosibErr<bool> setup(Config *);
131     void reset();
132     void process(FilterChar * &, FilterChar * &);
133   };
setup(Config * opts)134 
135   PosibErr<bool> SgmlFilter::setup(Config * opts)
136   {
137     name_ = which + "-filter";
138     order_num_ = 0.35;
139     check_attribs.clear();
140     skip_tags.clear();
141     opts->retrieve_list("f-" + which + "-skip",  &skip_tags);
142     opts->retrieve_list("f-" + which + "-check", &check_attribs);
143     reset();
144     return true;
145   }
reset()146 
147   void SgmlFilter::reset()
148   {
149     in_what = S_text;
150     quote_val = lookbehind = '\0';
151     skipall = 0;
152     include_attrib = false;
153   }
154 
155   // yes this should be inlines, it is only called once
156 
157   // RETURNS: TRUE if the caller should skip the passed char and
158   //  not do any spell check on it. FALSE if char is a part of the text
process_char(FilterChar::Chr c)159   //  of the document.
160   bool SgmlFilter::process_char(FilterChar::Chr c) {
161 
162     bool retval = true;  // DEFAULT RETURN VALUE. All returns are done
163     			 // via retval and falling out the bottom. Except for
164     			 // one case that must manage the lookbehind char.
165 
166     // PS: this switch will be fast since S_ABCs are an enum and
167     //  any good compiler will build a jump table for it.
168     // RE the gotos: Sometimes considered bad practice but that is
169     //  how the W3C code (1995) handles it. Could be done also with recursion
170     //  but I doubt that will clarify it. The gotos are done in cases where several
171     //  state changes occur on a single input char.
172 
173     switch( in_what ) {
174 
175       case S_text:   // 1. raw user text outside of any markup.
176 	   s_text:
177         switch( c ) {
178           case '&': in_what = S_ero;
179 		    break;
180           case '<': in_what = S_tag; tag_name.clear();
181 		    break;
182           default:
183                 retval = skipall;  // ********** RETVAL ASSIGNED
184         }			    // **************************
185         break;
186 
187       case S_tag:    // 2. reading the 'tag' in <tag>
188       		     //  heads up: <foo/bar will be treated as an end tag. That's what w3c does.
189         switch( c ) {
190 	  case '>': goto all_end_tags;
191           case '/': in_what = S_end;
192 		    tag_name.clear();
193 		    break;
194           case '!': in_what = S_md;
195 		    break;
196           default: // either more alphanum of the tag, or end of tagname:
197             if( asc_isalpha(c) || asc_isdigit(c) ) {
198                 tag_name += asc_tolower(c);
199             }
200             else {  // End of the tag:
201                 in_what = S_tag_gap;
202 		goto s_tag_gap;  // handle content in that zone.
203 	    }
204 	}
205 	break;
206 
207 	// '>'  '>'  '>'  '>'
208       all_end_tags:   // this gets called by several states to handle the
209 		       // possibility of a '>' ending a whole <tag...> guy.
210 	if( c != '>' ) break;
211 	in_what = S_text;
212 
213 	if( lookbehind == '/' ) {
214 	    // Wowza: this is how we handle the <script stuff /> XML style self
215 	    //  terminating tag. By clearing the tagname out tag-skip-all code
216 	    //  will not be invoked.
217 	    tag_name.clear();
218 	}
219 
220 	// Does this tag cause us to skip all content?
221 	if( skipall ) {
222 	    // already in a skip-all context. See if this is
223 	    // the same skipall tag:
224 	    if( !strcmp( tag_name.c_str(), tag_endskip.c_str() ) ) {
225 		++skipall;  // increment our nesting level count.
226 	    }
227 	}
228 	else {  // Should we begin a skip all range?
229 	    skipall = (skip_tags.have( tag_name.c_str() ) ? 1 : 0);
230 	    if( skipall ) {
231 		tag_endskip = tag_name;  // remember what tag to end on.
232 	    }
233 	}
234 	break;
235 
236       case S_tag_gap: // 3. gap between attributes within an element:
237 	   s_tag_gap:
238       	switch( c ) {
239       	  case '>': goto all_end_tags;
240 
241       	  case '=': in_what = S_attr_gap;
242 		    break; // uncommon - no-name attrib value
243       	  default:
244       	    if( asc_isspace( c ) ) break; // still in gap.
245       	    else {
246 		in_what = S_attr;   // start of attribute name;
247 		attrib_name.clear();
248       	 	attrib_name += asc_tolower( c );
249       	    }
250       	    break;
251       	 }
252       	 break;
253 
254       case S_end:     // 9. Same as S_tag, but this is a </zee> type end tag.
255       	if( asc_isalpha(c) || asc_isdigit(c) ) {
256       	  tag_name += asc_tolower( c );
257       	}
258       	else {
259 	  // See if we have left a skipall tag range.
260 	  if( skipall && !strcmp( tag_name.c_str(), tag_endskip.c_str() ) ) {
261 	    --skipall; // lessen nesting level count. This usually takes us to zero.
262 	  }
263 	  if( c == '>' ) in_what = S_text;  // --don't go to all_end_tags.  Really.
264 	  else in_what = S_ignore_junk;  // no-mans land: </end whats this??>
265       	}
266       	break;
267 
268       case S_ignore_junk:  // no-mans land state: </end whats this here??>
269       	if( c == '>' ) in_what = S_text;
270       	break;
271 
272       case S_attr:   // 4. Looking at an attrib name
273       	if( asc_isspace(c) ) in_what = S_attr_gap;
274       	else if( c == '=' )  in_what = S_equals;
275 	else if( c == '>' )  goto all_end_tags;
276 	else {
277 	  attrib_name += asc_tolower( c );
278 	}
279 	break;
280 
281       case S_attr_gap: // 5. optional gap after attrib name
282       	if( asc_isspace(c) ) break;
283       	else if( c == '=' )  in_what = S_equals;
284 	else if( c == '>' )  goto all_end_tags;
285 	else { // beginning of a brand new attr
286 	  attrib_name.clear();
287 	  attrib_name += asc_tolower( c );
288 	}
289 	break;
290 
291       case S_equals:  // 6. Attrib equals sign, also space after the equals.
292       	if( asc_isspace(c) ) break;
293       	switch( c ) {
294       	  case '>':  goto all_end_tags;
295 
296       	  case '\'':
297       	  case '"':  in_what = S_quoted;
298       	  	     quote_val = c;
299 		     break;
300       	  default:   in_what = S_value;
301 		     break;
302       	}
303       	// See if this attrib deserves full checking:
304       	include_attrib=check_attribs.have( attrib_name.c_str() );
305 	// Handle the first value char if that is where we are now:
306 	if( in_what == S_value ) goto s_value;
307 	break;
308 
309       case S_value:   // 7. In attrib value.
310 	   s_value:
311       	if( c == '>' ) goto all_end_tags;
312       	else if( asc_isspace(c) ) in_what = S_tag_gap; // end of attrib value
313       				// *****************************
314       				// ********** RETVAL ASSIGNED
315       	else if( include_attrib ) retval = false; // spell check this value.
316       	break;
317 
318       case S_quoted: // 8. In quoted attrib value.
319       	if( c == quote_val && lookbehind != '\\' ) in_what = S_tag_gap;
320       	else if( c == '\\' && lookbehind == '\\' ) {
321       		// This is an escape of an backslash. Therefore the backslash
322       		// does not escape what follows. Therefore we don't leave it in
323       		// the lookbehind. Yikes!
324       	  lookbehind = '\0';
325       	  return !include_attrib;      // ************* RETURN RETURN RETURN RETURN
326       	}
327       	else retval = !include_attrib;
328 	break;
329 
330       // note: these three cases - S_ero, S_cro, and S_entity which all handle
331       //  the &stuff; constructs are broken into 3 states for future upgrades. Someday
332       //  you may want to handle the chars these guys represent as individual chars.
333       //  I don't have the desire nor the knowledge to do it now.  -Tom, 5/5/04.
334       case S_ero:     // 10. in the &code; special encoding within HTML.
335       		// &# is a 'Char Ref Open'
336       	if( c == '#' ) {
337 	  in_what = S_cro;
338 	  break;
339 	}
340       	// FALLTHROUGH INTENTIONAL
341 
342       case S_cro:     // 12. after the # of a &#nnn; numerical char reference encoding.
343       case S_entity:  // 11. in the alpha named &nom; special encoding..
344 	if( asc_isalpha(c) || asc_isdigit(c) ) break; // more entity chars.
345 	in_what = S_text;
346       	if( c == ';' ) break;  // end of char code.
347 	goto s_text; // ran right into text. Handle it.
348 
349 
350       // SGML.. etc can have these special "declarations" within them. We skip them
351       //  in a more raw manners since they don't abide by the attrib= rules.
352       // Most importantly, some of the quoting rules don't apply.
353       //  <!ENTITY rtfchar "gg" 'tt' - - (high, low)>  <!--  fully commented -->
354       //   |               |    |                        ||                  ||
355       //   20              21   22                     23  24              25  26
356       case S_md:     // 20. In a declaration (or comment).
357       	switch( c ) {
358       	  case '-': if( lookbehind == '!' ) {
359 			in_what = S_com_1;
360 		    }
361 		    break;
362 
363       	  case '"':     // fallthrough - yes.
364       	  case '\'': in_what = S_mdq;
365 		     quote_val=c;
366 		     break;
367       	  case '>':  in_what = S_text; // note: NOT all_end_tags cause it's not a real tag.
368 		     break;
369       	}
370       	break;
371 
372     case S_mdq: // 22. Declaration in quotes.
373     	if( c == quote_val ) in_what = S_md;
374     	else if( c == '>' )  in_what = S_text;
375     	break;
376 
377     case S_com_1:  // 23. perhaps a comment beginning.
378     	if( c == '-' ) in_what = S_com;
379     	else if( c == '>' ) in_what = S_text;
380     	else in_what = S_md; // out of possible comment.
381     	break;
382 
383     case S_com:    // 24. Fully in a comment
384     	if( c == '-' && lookbehind == '-' ) in_what = S_com_e;
385     	break;
386 
387     case S_com_e:  // 25. Perhaps ending a comment.
388     	if( c == '>' ) in_what = S_text;
389     	else if( c != '-' ) in_what = S_com;  // back to basic comment.
390     	break;
391     }
392 
393     // update the lookbehind:
394     lookbehind = c;
395 
396     return( retval );
397   }
process(FilterChar * & str,FilterChar * & stop)398 
399   void SgmlFilter::process(FilterChar * & str, FilterChar * & stop)
400   {
401     FilterChar * cur = str;
402     while (cur != stop) {
403       if (process_char(*cur))
404 	*cur = ' ';
405       ++cur;
406     }
407   }
408 
409   //
410   //
411   //
412 
  // Decoder stage: process() copies its input into 'buf', replacing each
  // '&'-introduced reference with a single char, and then redirects the
  // caller's start/stop pointers to that buffer.
  class SgmlDecoder : public IndividualFilter
  {
    FilterCharVector buf; // output of the last process() call.
    String which;         // name prefix given at construction; used by setup()
                          // to build the decoder name.
  public:
    SgmlDecoder(const char * n) : which(n) {}
    PosibErr<bool> setup(Config *);
    // no-op: 'buf' is cleared at the start of every process() call.
    void reset() {}
    void process(FilterChar * &, FilterChar * &);
  };
setup(Config *)423 
424   PosibErr<bool> SgmlDecoder::setup(Config *)
425   {
426     name_ = which + "-decoder";
427     order_num_ = 0.65;
428     return true;
429   }
process(FilterChar * & start,FilterChar * & stop)430 
431   void SgmlDecoder::process(FilterChar * & start, FilterChar * & stop)
432   {
433     buf.clear();
434     FilterChar * i = start;
435     while (i != stop)
436     {
437       if (*i == '&') {
438 	FilterChar * i0 = i;
439 	FilterChar::Chr chr;
440 	++i;
441 	if (i != stop && *i == '#') {
442 	  chr = 0;
443 	  ++i;
444 	  while (i != stop && asc_isdigit(*i)) {
445 	    chr *= 10;
446 	    chr += *i - '0';
447 	    ++i;
448 	  }
449 	} else {
450 	  while (i != stop && (asc_isalpha(*i) || asc_isdigit(*i))) {
451 	    ++i;
452 	  }
453 	  chr = '?';
454 	}
455 	if (i != stop && *i == ';')
456 	  ++i;
457 	buf.append(FilterChar(chr, i0, i));
458       } else {
459 	buf.append(*i);
460 	++i;
461       }
462     }
463     buf.append('\0');
464     start = buf.pbegin();
465     stop  = buf.pend() - 1;
466   }
467 
468   //
469   // Sgml Encoder - BROKEN do not use
470   //
471 
472 //   class SgmlEncoder : public IndividualFilter
473 //   {
474 //     FilterCharVector buf;
475 //     String which;
476 //   public:
477 //     SgmlEncoder(const char * n) : which(n) {}
478 //     PosibErr<bool> setup(Config *);
479 //     void reset() {}
480 //     void process(FilterChar * &, FilterChar * &);
481 //   };
482 
483 //   PosibErr<bool> SgmlEncoder::setup(Config *)
484 //   {
485 //     name_ = which + "-encoder";
486 //     order_num_ = 0.99;
487 //     return true;
488 //   }
489 
490 //   void SgmlEncoder::process(FilterChar * & start, FilterChar * & stop)
491 //   {
492 //     buf.clear();
493 //     FilterChar * i = start;
494 //     while (i != stop)
495 //     {
496 //       if (*i > 127) {
497 // 	buf.append("&#", i->width);
498 // 	char b[10];
499 // 	sprintf(b, "%d", i->chr);
500 // 	buf.append(b, 0);
501 // 	buf.append(';', 0);
502 //       } else {
503 // 	buf.append(*i);
504 //       }
505 //       ++i;
506 //     }
507 //     buf.append('\0');
508 //     start = buf.pbegin();
509 //     stop  = buf.pend() - 1;
510 //   }
511 }
new_aspell_sgml_filter()512 
513 C_EXPORT IndividualFilter * new_aspell_sgml_filter()
514 {
515   return new SgmlFilter("sgml");
516 }
517 C_EXPORT IndividualFilter * new_aspell_sgml_decoder()
518 {
519   return new SgmlDecoder("sgml");
520 }
521 // C_EXPORT IndividualFilter * new_aspell_sgml_encoder()
522 // {
523 //   return new SgmlEncoder("sgml");
524 // }
new_aspell_html_filter()525 
526 C_EXPORT IndividualFilter * new_aspell_html_filter()
527 {
528   return new SgmlFilter("html");
529 }
530 C_EXPORT IndividualFilter * new_aspell_html_decoder()
531 {
532   return new SgmlDecoder("html");
533 }
534 // C_EXPORT IndividualFilter * new_aspell_html_encoder()
535 // {
536 //   return new SgmlEncoder("html");
537 // }
538 
539 
540 /* Example HTML:
541 
542 <!--
543 This file contains several constructs that test the parsing and
544 handling of SGML/HTML/XML in sgml.cpp.
545 
546 The only spelling errors you should see will be the word 'report this NNNN'.
547 There will be 22 of these.
548 
549 run this by executing:
550 aspell pipe -H < sgmltest.html
551 
552 WARNING: this is not really valid HTML. Don't display in a browser!
553 -->
554 
555 <!-- phase 1 - SGML comments. -->
556 reportthiszphaseONE
557  <!-- ** 1.0 Valid comments... This file is full of them.  -->
558  <!-- ** 1.1 invalid open comment: -->
559 <!- not in a comment>reportthisyes</!->
560 
561  <!-- ** 1.2 invalid close comment: -->
562 <!-- -- > spallwhat DON'T REPORT -> spallwhat DON'T REPORT -->
563 
564 <!-- phase 1.5 - special entity encodings -->
565 reportthisphaseONEFIVE
566  &nbsp; don't&nbsp;report&nbsp;this
567  &#011; do not&#x20;report this.
568  do not&gt;report this.
569  this &amp; that.
570 
571 <!-- phase 2 - special skip tags -->
572 reportthisphaseTWO
573 <SCRIPT> spallwhat DON'T REPORT </SCRIPT> reportthisyes
574 <style> spallwhat DON'T REPORT </style> reportthisyes
575 <STYLE something="yes yes"
576       > spallwhat DON'T REPORT </style > reportthisyes
577 <script/> reportthisyes  <!-- XHTML style terminated tag -->
578 <script someattrib=value/> reportthisyes  <!-- XHTML style terminated tag -->
579 <!-- Nested skip tags -->
580 <script> spallwhatnoreport <script> nonoreport </script><b>hello</b> nonoreport</script>reportthisyes
581 
582 <!-- phase 3 - special 'include this' attributes -->
583 reportthisphaseTHREE
584 <tagname alt="image text reportthisyes" alt2=spallwhat altt="spallwhat don't report">
585 <tagname ALT="image text reportthisyes" ALT2=spallwhat AL="spallwhat don't report">
586 
587 <!-- phase 4 - attribute value quoteing and escaping -->
588 reportthisphaseoneFOUR
589 <checkthis attribute111=simple/value.value >
590 <checkagain SOMEattrib   =   "whoa boy, mimimimspelled  ">
591 <singlequotes gotcha=   'singlypingly quoted///'>
592 <dblescaped gogogogo="dontcheck \">still in dontcheck\\\" still in dontcheck"> reportthisyes.
593 <dBLmore TomTomTomTom="so many escapes: \\\\\\\\"> reportthisyes.
594 <dblescaped gogogogo='dontcheck \'>still in dontcheck\\\' still in dontcheck'> reportthisyes.
595 <dBLmore TomTomTomTom='so many escapes: \\\\\\\\'> reportthisyes.
596 <mixnmatch scanhere='">dontcheck \"dontcheck \'dontcheck' alt=reportthisyes>
597 
598 <!-- phase 5 - questionable (though all too common) constructs -->
599 reportthisphaseFIVE
600 <tag=dontreport> reportthisyes <tag hahahahhaha>reportthisyes
601 <!-- this one is from Yahoo! -->
602 <td width=1%><img src="http://wellll/thereeee/nowwww" alt="cool stuff">
603 <td width=1%><img src=http://wellll/thereeee/nowwww alt=real cool stuff>
604 
605 */
606