// This file is part of The New Aspell
// Copyright (C) 2004 by Tom Snyder
// Copyright (C) 2001-2004 by Kevin Atkinson under the GNU LGPL license
// version 2.0 or 2.1.  You should have received a copy of the LGPL
// license along with this library; if you did not, you can find
// it at http://www.gnu.org/.
//
// The original filter was written by Kevin Atkinson.
// Tom Snyder rewrote the filter to support skipping SGML tags.
//
// This filter enables the spell checking of sgml, html, and xhtml files
// by skipping the <elements> and such.
// The overall strategy is based on http://www.w3.org/Library/SGML.c.
// We don't use that code (nor the sourceforge 'expat' project code) for
// simplicity's sake.  We don't need to fully parse all aspects of HTML -
// we just need to skip and handle a few aspects.  The w3.org code had too
// many linkages into their overall SGML/HTML processing engine.
//
// See the comment at the end of this file for examples of what we handle.
// See the config setting docs regarding our config lists: check and skip.
21 22 #include <stdio.h> // needed for sprintf 23 24 #include "settings.h" 25 26 #include "asc_ctype.hpp" 27 #include "config.hpp" 28 #include "indiv_filter.hpp" 29 #include "string_map.hpp" 30 #include "mutable_container.hpp" 31 #include "clone_ptr-t.hpp" 32 #include "filter_char_vector.hpp" 33 34 //right now unused option 35 // static const KeyInfo sgml_options[] = { 36 // {"sgml-extension", KeyInfoList, "html,htm,php,sgml", 37 // N_("sgml file extensions")} 38 // }; 39 40 namespace { 41 42 using namespace acommon; 43 44 class ToLowerMap : public StringMap 45 { 46 public: add(ParmStr to_add)47 PosibErr<bool> add(ParmStr to_add) { 48 String new_key; 49 for (const char * i = to_add; *i; ++i) new_key += asc_tolower(*i); 50 return StringMap::add(new_key); 51 } 52 remove(ParmStr to_rem)53 PosibErr<bool> remove(ParmStr to_rem) { 54 String new_key; 55 for (const char * i = to_rem; *i; ++i) new_key += asc_tolower(*i); 56 return StringMap::remove(new_key); 57 } 58 }; 59 60 class SgmlFilter : public IndividualFilter 61 { 62 // State enum. These states track where we are in the HTML/tag/element constructs. 63 // This diagram shows the main states. The marked number is the state we enter 64 // *after* we've read that char. Note that some of the state transitions handle 65 // illegal HTML such as <tag=what?>. 66 // 67 // real text <tag attrib = this attrib2='that'> </tag> { 68 // | | | | || | | | | | | 69 // 1 2 3 4 56 7 8 10 11 9 12 70 enum ScanState { 71 S_text, // 1. raw user text outside of any markup. 72 S_tag, // 2. reading the 'tag' in <tag> 73 S_tag_gap,// 3. gap between attributes within an element: 74 S_attr, // 4. Looking at an attrib name 75 S_attr_gap,// 5. optional gap after attrib name 76 S_equals, // 6. Attrib equals sign, also space after the equals. 77 S_value, // 7. In attrib value. 78 S_quoted, // 8. In quoted attrib value. 79 S_end, // 9. Same as S_tag, but this is a </zee> type end tag. 80 S_ignore_junk, // special case invalid area to ignore. 
81 S_ero, // 10. in the &code; special encoding within HTML. 82 S_entity, // 11. in the alpha named &nom; special encoding.. 83 S_cro, // 12. after the # of a &#nnn; numerical char reference encoding. 84 85 // SGML.. etc can have these special "declarations" within them. We skip them 86 // in a more raw manners since they don't abide by the attrib= rules. 87 // Most importantly, some of the attrib quoting rules don't apply. 88 // <!ENTITY rtfchar "gg" - - (high, low)> <!-- fully commented --> 89 // | | || | 90 // 20 21 23 24 25 91 S_md, // 20. In a declaration (or beginning a comment). 92 S_mdq, // 21. Declaration in quotes - double or single quotes. 93 S_com_1, // 23. perhaps a comment beginning. 94 S_com, // 24. Fully in a comment 95 S_com_e, // 25. Perhaps ending a comment. 96 97 //S_literal, // within a tag pair that means all content should be interpreted literally: <PRE> 98 // NOT CURRENTLY SUPPORTED FULLY. 99 100 //S_esc,S_dollar,S_paren,S_nonasciitext // WOULD BE USED FOR ISO_2022_JP support. 101 // NOT CURRENTLY SUPPORTED. 102 }; 103 104 ScanState in_what; 105 // which quote char is quoting this attrib value. 106 107 FilterChar::Chr quote_val; 108 // one char prior to this one. For escape handling and such. 109 FilterChar::Chr lookbehind; 110 111 String tag_name; // we accumulate the current tag name here. 112 String attrib_name; // we accumulate the current attribute name here. 113 114 bool include_attrib; // are we in one of the attribs that *should* be spell checked (alt=..) 115 int skipall; // are we in one of the special skip-all content tags? This is treated 116 // as a bool and as a nesting level count. 117 String tag_endskip; // tag name that will end that. 118 119 StringMap check_attribs; // list of attribs that we *should* spell check. 120 StringMap skip_tags; // list of tags that start a no-check-at-all zone. 
121 122 String which; 123 124 bool process_char(FilterChar::Chr c); 125 126 public: SgmlFilter(const char * n)127 128 SgmlFilter(const char * n) : which(n) {} 129 130 PosibErr<bool> setup(Config *); 131 void reset(); 132 void process(FilterChar * &, FilterChar * &); 133 }; setup(Config * opts)134 135 PosibErr<bool> SgmlFilter::setup(Config * opts) 136 { 137 name_ = which + "-filter"; 138 order_num_ = 0.35; 139 check_attribs.clear(); 140 skip_tags.clear(); 141 opts->retrieve_list("f-" + which + "-skip", &skip_tags); 142 opts->retrieve_list("f-" + which + "-check", &check_attribs); 143 reset(); 144 return true; 145 } reset()146 147 void SgmlFilter::reset() 148 { 149 in_what = S_text; 150 quote_val = lookbehind = '\0'; 151 skipall = 0; 152 include_attrib = false; 153 } 154 155 // yes this should be inlines, it is only called once 156 157 // RETURNS: TRUE if the caller should skip the passed char and 158 // not do any spell check on it. FALSE if char is a part of the text process_char(FilterChar::Chr c)159 // of the document. 160 bool SgmlFilter::process_char(FilterChar::Chr c) { 161 162 bool retval = true; // DEFAULT RETURN VALUE. All returns are done 163 // via retval and falling out the bottom. Except for 164 // one case that must manage the lookbehind char. 165 166 // PS: this switch will be fast since S_ABCs are an enum and 167 // any good compiler will build a jump table for it. 168 // RE the gotos: Sometimes considered bad practice but that is 169 // how the W3C code (1995) handles it. Could be done also with recursion 170 // but I doubt that will clarify it. The gotos are done in cases where several 171 // state changes occur on a single input char. 172 173 switch( in_what ) { 174 175 case S_text: // 1. raw user text outside of any markup. 
176 s_text: 177 switch( c ) { 178 case '&': in_what = S_ero; 179 break; 180 case '<': in_what = S_tag; tag_name.clear(); 181 break; 182 default: 183 retval = skipall; // ********** RETVAL ASSIGNED 184 } // ************************** 185 break; 186 187 case S_tag: // 2. reading the 'tag' in <tag> 188 // heads up: <foo/bar will be treated as an end tag. That's what w3c does. 189 switch( c ) { 190 case '>': goto all_end_tags; 191 case '/': in_what = S_end; 192 tag_name.clear(); 193 break; 194 case '!': in_what = S_md; 195 break; 196 default: // either more alphanum of the tag, or end of tagname: 197 if( asc_isalpha(c) || asc_isdigit(c) ) { 198 tag_name += asc_tolower(c); 199 } 200 else { // End of the tag: 201 in_what = S_tag_gap; 202 goto s_tag_gap; // handle content in that zone. 203 } 204 } 205 break; 206 207 // '>' '>' '>' '>' 208 all_end_tags: // this gets called by several states to handle the 209 // possibility of a '>' ending a whole <tag...> guy. 210 if( c != '>' ) break; 211 in_what = S_text; 212 213 if( lookbehind == '/' ) { 214 // Wowza: this is how we handle the <script stuff /> XML style self 215 // terminating tag. By clearing the tagname out tag-skip-all code 216 // will not be invoked. 217 tag_name.clear(); 218 } 219 220 // Does this tag cause us to skip all content? 221 if( skipall ) { 222 // already in a skip-all context. See if this is 223 // the same skipall tag: 224 if( !strcmp( tag_name.c_str(), tag_endskip.c_str() ) ) { 225 ++skipall; // increment our nesting level count. 226 } 227 } 228 else { // Should we begin a skip all range? 229 skipall = (skip_tags.have( tag_name.c_str() ) ? 1 : 0); 230 if( skipall ) { 231 tag_endskip = tag_name; // remember what tag to end on. 232 } 233 } 234 break; 235 236 case S_tag_gap: // 3. 
gap between attributes within an element: 237 s_tag_gap: 238 switch( c ) { 239 case '>': goto all_end_tags; 240 241 case '=': in_what = S_attr_gap; 242 break; // uncommon - no-name attrib value 243 default: 244 if( asc_isspace( c ) ) break; // still in gap. 245 else { 246 in_what = S_attr; // start of attribute name; 247 attrib_name.clear(); 248 attrib_name += asc_tolower( c ); 249 } 250 break; 251 } 252 break; 253 254 case S_end: // 9. Same as S_tag, but this is a </zee> type end tag. 255 if( asc_isalpha(c) || asc_isdigit(c) ) { 256 tag_name += asc_tolower( c ); 257 } 258 else { 259 // See if we have left a skipall tag range. 260 if( skipall && !strcmp( tag_name.c_str(), tag_endskip.c_str() ) ) { 261 --skipall; // lessen nesting level count. This usually takes us to zero. 262 } 263 if( c == '>' ) in_what = S_text; // --don't go to all_end_tags. Really. 264 else in_what = S_ignore_junk; // no-mans land: </end whats this??> 265 } 266 break; 267 268 case S_ignore_junk: // no-mans land state: </end whats this here??> 269 if( c == '>' ) in_what = S_text; 270 break; 271 272 case S_attr: // 4. Looking at an attrib name 273 if( asc_isspace(c) ) in_what = S_attr_gap; 274 else if( c == '=' ) in_what = S_equals; 275 else if( c == '>' ) goto all_end_tags; 276 else { 277 attrib_name += asc_tolower( c ); 278 } 279 break; 280 281 case S_attr_gap: // 5. optional gap after attrib name 282 if( asc_isspace(c) ) break; 283 else if( c == '=' ) in_what = S_equals; 284 else if( c == '>' ) goto all_end_tags; 285 else { // beginning of a brand new attr 286 attrib_name.clear(); 287 attrib_name += asc_tolower( c ); 288 } 289 break; 290 291 case S_equals: // 6. Attrib equals sign, also space after the equals. 
292 if( asc_isspace(c) ) break; 293 switch( c ) { 294 case '>': goto all_end_tags; 295 296 case '\'': 297 case '"': in_what = S_quoted; 298 quote_val = c; 299 break; 300 default: in_what = S_value; 301 break; 302 } 303 // See if this attrib deserves full checking: 304 include_attrib=check_attribs.have( attrib_name.c_str() ); 305 // Handle the first value char if that is where we are now: 306 if( in_what == S_value ) goto s_value; 307 break; 308 309 case S_value: // 7. In attrib value. 310 s_value: 311 if( c == '>' ) goto all_end_tags; 312 else if( asc_isspace(c) ) in_what = S_tag_gap; // end of attrib value 313 // ***************************** 314 // ********** RETVAL ASSIGNED 315 else if( include_attrib ) retval = false; // spell check this value. 316 break; 317 318 case S_quoted: // 8. In quoted attrib value. 319 if( c == quote_val && lookbehind != '\\' ) in_what = S_tag_gap; 320 else if( c == '\\' && lookbehind == '\\' ) { 321 // This is an escape of an backslash. Therefore the backslash 322 // does not escape what follows. Therefore we don't leave it in 323 // the lookbehind. Yikes! 324 lookbehind = '\0'; 325 return !include_attrib; // ************* RETURN RETURN RETURN RETURN 326 } 327 else retval = !include_attrib; 328 break; 329 330 // note: these three cases - S_ero, S_cro, and S_entity which all handle 331 // the &stuff; constructs are broken into 3 states for future upgrades. Someday 332 // you may want to handle the chars these guys represent as individual chars. 333 // I don't have the desire nor the knowledge to do it now. -Tom, 5/5/04. 334 case S_ero: // 10. in the &code; special encoding within HTML. 335 // &# is a 'Char Ref Open' 336 if( c == '#' ) { 337 in_what = S_cro; 338 break; 339 } 340 // FALLTHROUGH INTENTIONAL 341 342 case S_cro: // 12. after the # of a &#nnn; numerical char reference encoding. 343 case S_entity: // 11. in the alpha named &nom; special encoding.. 344 if( asc_isalpha(c) || asc_isdigit(c) ) break; // more entity chars. 
345 in_what = S_text; 346 if( c == ';' ) break; // end of char code. 347 goto s_text; // ran right into text. Handle it. 348 349 350 // SGML.. etc can have these special "declarations" within them. We skip them 351 // in a more raw manners since they don't abide by the attrib= rules. 352 // Most importantly, some of the quoting rules don't apply. 353 // <!ENTITY rtfchar "gg" 'tt' - - (high, low)> <!-- fully commented --> 354 // | | | || || 355 // 20 21 22 23 24 25 26 356 case S_md: // 20. In a declaration (or comment). 357 switch( c ) { 358 case '-': if( lookbehind == '!' ) { 359 in_what = S_com_1; 360 } 361 break; 362 363 case '"': // fallthrough - yes. 364 case '\'': in_what = S_mdq; 365 quote_val=c; 366 break; 367 case '>': in_what = S_text; // note: NOT all_end_tags cause it's not a real tag. 368 break; 369 } 370 break; 371 372 case S_mdq: // 22. Declaration in quotes. 373 if( c == quote_val ) in_what = S_md; 374 else if( c == '>' ) in_what = S_text; 375 break; 376 377 case S_com_1: // 23. perhaps a comment beginning. 378 if( c == '-' ) in_what = S_com; 379 else if( c == '>' ) in_what = S_text; 380 else in_what = S_md; // out of possible comment. 381 break; 382 383 case S_com: // 24. Fully in a comment 384 if( c == '-' && lookbehind == '-' ) in_what = S_com_e; 385 break; 386 387 case S_com_e: // 25. Perhaps ending a comment. 388 if( c == '>' ) in_what = S_text; 389 else if( c != '-' ) in_what = S_com; // back to basic comment. 
390 break; 391 } 392 393 // update the lookbehind: 394 lookbehind = c; 395 396 return( retval ); 397 } process(FilterChar * & str,FilterChar * & stop)398 399 void SgmlFilter::process(FilterChar * & str, FilterChar * & stop) 400 { 401 FilterChar * cur = str; 402 while (cur != stop) { 403 if (process_char(*cur)) 404 *cur = ' '; 405 ++cur; 406 } 407 } 408 409 // 410 // 411 // 412 413 class SgmlDecoder : public IndividualFilter 414 { 415 FilterCharVector buf; 416 String which; SgmlDecoder(const char * n)417 public: 418 SgmlDecoder(const char * n) : which(n) {} reset()419 PosibErr<bool> setup(Config *); 420 void reset() {} 421 void process(FilterChar * &, FilterChar * &); 422 }; setup(Config *)423 424 PosibErr<bool> SgmlDecoder::setup(Config *) 425 { 426 name_ = which + "-decoder"; 427 order_num_ = 0.65; 428 return true; 429 } process(FilterChar * & start,FilterChar * & stop)430 431 void SgmlDecoder::process(FilterChar * & start, FilterChar * & stop) 432 { 433 buf.clear(); 434 FilterChar * i = start; 435 while (i != stop) 436 { 437 if (*i == '&') { 438 FilterChar * i0 = i; 439 FilterChar::Chr chr; 440 ++i; 441 if (i != stop && *i == '#') { 442 chr = 0; 443 ++i; 444 while (i != stop && asc_isdigit(*i)) { 445 chr *= 10; 446 chr += *i - '0'; 447 ++i; 448 } 449 } else { 450 while (i != stop && (asc_isalpha(*i) || asc_isdigit(*i))) { 451 ++i; 452 } 453 chr = '?'; 454 } 455 if (i != stop && *i == ';') 456 ++i; 457 buf.append(FilterChar(chr, i0, i)); 458 } else { 459 buf.append(*i); 460 ++i; 461 } 462 } 463 buf.append('\0'); 464 start = buf.pbegin(); 465 stop = buf.pend() - 1; 466 } 467 468 // 469 // Sgml Encoder - BROKEN do not use 470 // 471 472 // class SgmlEncoder : public IndividualFilter 473 // { 474 // FilterCharVector buf; 475 // String which; 476 // public: 477 // SgmlEncoder(const char * n) : which(n) {} 478 // PosibErr<bool> setup(Config *); 479 // void reset() {} 480 // void process(FilterChar * &, FilterChar * &); 481 // }; 482 483 // PosibErr<bool> 
SgmlEncoder::setup(Config *) 484 // { 485 // name_ = which + "-encoder"; 486 // order_num_ = 0.99; 487 // return true; 488 // } 489 490 // void SgmlEncoder::process(FilterChar * & start, FilterChar * & stop) 491 // { 492 // buf.clear(); 493 // FilterChar * i = start; 494 // while (i != stop) 495 // { 496 // if (*i > 127) { 497 // buf.append("&#", i->width); 498 // char b[10]; 499 // sprintf(b, "%d", i->chr); 500 // buf.append(b, 0); 501 // buf.append(';', 0); 502 // } else { 503 // buf.append(*i); 504 // } 505 // ++i; 506 // } 507 // buf.append('\0'); 508 // start = buf.pbegin(); 509 // stop = buf.pend() - 1; 510 // } 511 } new_aspell_sgml_filter()512 513 C_EXPORT IndividualFilter * new_aspell_sgml_filter() 514 { 515 return new SgmlFilter("sgml"); 516 } 517 C_EXPORT IndividualFilter * new_aspell_sgml_decoder() 518 { 519 return new SgmlDecoder("sgml"); 520 } 521 // C_EXPORT IndividualFilter * new_aspell_sgml_encoder() 522 // { 523 // return new SgmlEncoder("sgml"); 524 // } new_aspell_html_filter()525 526 C_EXPORT IndividualFilter * new_aspell_html_filter() 527 { 528 return new SgmlFilter("html"); 529 } 530 C_EXPORT IndividualFilter * new_aspell_html_decoder() 531 { 532 return new SgmlDecoder("html"); 533 } 534 // C_EXPORT IndividualFilter * new_aspell_html_encoder() 535 // { 536 // return new SgmlEncoder("html"); 537 // } 538 539 540 /* Example HTML: 541 542 <!-- 543 This file contains several constructs that test the parsing and 544 handling of SGML/HTML/XML in sgml.cpp. 545 546 The only spelling errors you should see will be the word 'report this NNNN'. 547 There will be 22 of these. 548 549 run this by executing: 550 aspell pipe -H < sgmltest.html 551 552 WARNING: this is not really valid HTML. Don't display in a browser! 553 --> 554 555 <!-- phase 1 - SGML comments. --> 556 reportthiszphaseONE 557 <!-- ** 1.0 Valid comments... This file is full of them. 
--> 558 <!-- ** 1.1 invalid open comment: --> 559 <!- not in a comment>reportthisyes</!-> 560 561 <!-- ** 1.2 invalid close comment: --> 562 <!-- -- > spallwhat DON'T REPORT -> spallwhat DON'T REPORT --> 563 564 <!-- phase 1.5 - special entity encodings --> 565 reportthisphaseONEFIVE 566 don't report this 567  do not report this. 568 do not>report this. 569 this & that. 570 571 <!-- phase 2 - special skip tags --> 572 reportthisphaseTWO 573 <SCRIPT> spallwhat DON'T REPORT </SCRIPT> reportthisyes 574 <style> spallwhat DON'T REPORT </style> reportthisyes 575 <STYLE something="yes yes" 576 > spallwhat DON'T REPORT </style > reportthisyes 577 <script/> reportthisyes <!-- XHTML style terminated tag --> 578 <script someattrib=value/> reportthisyes <!-- XHTML style terminated tag --> 579 <!-- Nested skip tags --> 580 <script> spallwhatnoreport <script> nonoreport </script><b>hello</b> nonoreport</script>reportthisyes 581 582 <!-- phase 3 - special 'include this' attributes --> 583 reportthisphaseTHREE 584 <tagname alt="image text reportthisyes" alt2=spallwhat altt="spallwhat don't report"> 585 <tagname ALT="image text reportthisyes" ALT2=spallwhat AL="spallwhat don't report"> 586 587 <!-- phase 4 - attribute value quoteing and escaping --> 588 reportthisphaseoneFOUR 589 <checkthis attribute111=simple/value.value > 590 <checkagain SOMEattrib = "whoa boy, mimimimspelled "> 591 <singlequotes gotcha= 'singlypingly quoted///'> 592 <dblescaped gogogogo="dontcheck \">still in dontcheck\\\" still in dontcheck"> reportthisyes. 593 <dBLmore TomTomTomTom="so many escapes: \\\\\\\\"> reportthisyes. 594 <dblescaped gogogogo='dontcheck \'>still in dontcheck\\\' still in dontcheck'> reportthisyes. 595 <dBLmore TomTomTomTom='so many escapes: \\\\\\\\'> reportthisyes. 
596 <mixnmatch scanhere='">dontcheck \"dontcheck \'dontcheck' alt=reportthisyes> 597 598 <!-- phase 5 - questionable (though all too common) constructs --> 599 reportthisphaseFIVE 600 <tag=dontreport> reportthisyes <tag hahahahhaha>reportthisyes 601 <!-- this one is from Yahoo! --> 602 <td width=1%><img src="http://wellll/thereeee/nowwww" alt="cool stuff"> 603 <td width=1%><img src=http://wellll/thereeee/nowwww alt=real cool stuff> 604 605 */ 606