//  link_check implementation  -----------------------------------------------//

//  Copyright Beman Dawes 2002.
//
//  Distributed under the Boost Software License, Version 1.0.
//  (See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/config/defines.hpp>

#include "link_check.hpp"
#include "function_hyper.hpp"
#include "boost/regex.hpp"
#include "boost/filesystem/operations.hpp"
#include <boost/algorithm/string/case_conv.hpp>
#include <cstdlib>
#include <set>

// #include <iostream>

namespace fs = boost::filesystem;

namespace
{
// Matches either an HTML tag carrying a NAME or ID attribute, or an HTML
// comment (case-insensitive).
//   [1] tag name, [2] attribute name (NAME or ID), [3] quote character,
//   [4] attribute value (the bookmark).
// The comment alternative has no capture groups; matching it lets the scan
// step over a whole comment, so [4] is unmatched for comments.
boost::regex html_bookmark_regex(
  "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3"
  "|<!--.*?-->",
  boost::regbase::normal | boost::regbase::icase);

// Matches either an HTML tag carrying an HREF or SRC attribute, or an HTML
// comment (case-insensitive).
//   [1] tag name, [2] quote character, [3] URL with surrounding whitespace
//   excluded.
boost::regex html_url_regex(
  "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)"   // HREF or SRC
  "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2"
  "|<!--.*?-->",
  boost::regbase::normal | boost::regbase::icase);

// Matches either a CSS '@import "..."' / 'url(...)' reference, or a CSS
// comment (case-insensitive).
//   [1] the '@import' or 'url(' prefix including any opening quote,
//   [2] the URL (everything up to the next quote or ')').
// There are only two capture groups.
boost::regex css_url_regex(
  "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)"
  "|/\\*.*?\\*/",
  boost::regbase::normal | boost::regbase::icase);

// Regular expression for parsing URLs from:
// http://tools.ietf.org/html/rfc3986#appendix-B
//   [2] scheme, [4] authority, [5] path, [7] query, [9] fragment.
boost::regex url_decompose_regex(
  "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
  boost::regbase::normal);

// Bookmarks (NAME/ID attribute values) found in the file currently being
// inspected. Both sets are file-scope state that is cleared at the start of
// each content inspection.
typedef std::set<std::string> bookmark_set;
bookmark_set bookmarks;            // exact spelling, used to resolve "#fragment" links
bookmark_set bookmarks_lowercase;  // lower-cased copy: the duplicate check is case-insensitive

// Decode HTML-escaped ampersands in a URL.
decode_ampersands(std::string const & url_path)50 std::string decode_ampersands(std::string const& url_path) { 51 std::string::size_type pos = 0, next; 52 std::string result; 53 result.reserve(url_path.length()); 54 55 while((next = url_path.find('&', pos)) != std::string::npos) { 56 result.append(url_path, pos, next - pos); 57 pos = next; 58 if(url_path.substr(pos, 5) == "&") { 59 result += '&'; pos += 5; 60 } 61 else { 62 result += '&'; pos += 1; 63 } 64 break; 65 } 66 67 result.append(url_path, pos, url_path.length()); 68 69 return result; 70 } 71 72 // Decode percent encoded characters, returns an empty string if there's an error. decode_percents(std::string const & url_path)73 std::string decode_percents(std::string const& url_path) { 74 std::string::size_type pos = 0, next; 75 std::string result; 76 result.reserve(url_path.length()); 77 78 while((next = url_path.find('%', pos)) != std::string::npos) { 79 result.append(url_path, pos, next - pos); 80 pos = next; 81 switch(url_path[pos]) { 82 case '%': { 83 if(url_path.length() - next < 3) return ""; 84 char hex[3] = { url_path[next + 1], url_path[next + 2], '\0' }; 85 char* end_ptr; 86 result += (char) std::strtol(hex, &end_ptr, 16); 87 if(*end_ptr) return ""; 88 pos = next + 3; 89 break; 90 } 91 } 92 } 93 94 result.append(url_path, pos, url_path.length()); 95 96 return result; 97 } 98 is_css(const path & p)99 bool is_css(const path & p) { 100 return p.extension() == ".css"; 101 } 102 103 } // unnamed namespace 104 105 namespace boost 106 { 107 namespace inspect 108 { 109 110 // link_check constructor --------------------------------------------------// 111 link_check()112 link_check::link_check() 113 : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0), 114 m_bookmark_errors(0), m_duplicate_bookmark_errors(0) 115 { 116 // HTML signatures are already registered by the base class, 117 // 'hypertext_inspector' 118 register_signature(".css"); 119 } 120 121 // inspect (all) 
-----------------------------------------------------------// 122 inspect(const string &,const path & full_path)123 void link_check::inspect( 124 const string & /*library_name*/, 125 const path & full_path ) 126 { 127 // keep track of paths already encountered to reduce disk activity 128 if ( !fs::is_directory( full_path ) ) 129 m_paths[ relative_to( full_path, search_root_path() ) ] |= m_present; 130 } 131 132 // inspect ( .htm, .html, .shtml, .css ) -----------------------------------// 133 inspect(const string & library_name,const path & full_path,const string & contents)134 void link_check::inspect( 135 const string & library_name, 136 const path & full_path, // example: c:/foo/boost/filesystem/path.hpp 137 const string & contents ) // contents of file to be inspected 138 { 139 if (contents.find( "hpxinspect:" "nounlinked" ) != string::npos) 140 m_paths[ relative_to( full_path, search_root_path() ) ] |= m_nounlinked_errors; 141 142 bool no_link_errors = 143 (contents.find( "hpxinspect:" "nolink" ) != string::npos); 144 145 // build bookmarks databases 146 bookmarks.clear(); 147 bookmarks_lowercase.clear(); 148 string::const_iterator a_start( contents.begin() ); 149 string::const_iterator a_end( contents.end() ); 150 boost::match_results< string::const_iterator > a_what; 151 boost::match_flag_type a_flags = boost::match_default; 152 153 if(!is_css(full_path)) 154 { 155 string previous_id; 156 157 while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) ) 158 { 159 // a_what[0] contains the whole string iterators. 160 // a_what[1] contains the tag iterators. 161 // a_what[2] contains the attribute name. 162 // a_what[4] contains the bookmark iterators. 
163 164 if (a_what[4].matched) 165 { 166 string tag( a_what[1].first, a_what[1].second ); 167 boost::algorithm::to_lower(tag); 168 string attribute( a_what[2].first, a_what[2].second ); 169 boost::algorithm::to_lower(attribute); 170 string bookmark( a_what[4].first, a_what[4].second ); 171 172 bool name_following_id = ( attribute == "name" && previous_id == bookmark ); 173 if ( tag != "meta" && attribute == "id" ) previous_id = bookmark; 174 else previous_id.clear(); 175 176 if ( tag != "meta" && !name_following_id ) 177 { 178 bookmarks.insert( bookmark ); 179 // std::cout << "******************* " << bookmark << '\n'; 180 181 // w3.org recommends case-insensitive checking for duplicate bookmarks 182 // since some browsers do a case-insensitive match. 183 string bookmark_lowercase( bookmark ); 184 boost::algorithm::to_lower(bookmark_lowercase); 185 186 std::pair<bookmark_set::iterator, bool> result 187 = bookmarks_lowercase.insert( bookmark_lowercase ); 188 if (!result.second) 189 { 190 ++m_duplicate_bookmark_errors; 191 std::size_t ln = std::count( contents.begin(), 192 a_what[3].first, '\n' ) + 1; 193 error( library_name, full_path, "Duplicate bookmark: " 194 + bookmark, ln ); 195 } 196 } 197 } 198 199 a_start = a_what[0].second; // update search position 200 a_flags |= boost::match_prev_avail; // update flags 201 a_flags |= boost::match_not_bob; 202 } 203 } 204 205 // process urls 206 string::const_iterator start( contents.begin() ); 207 string::const_iterator end( contents.end() ); 208 boost::match_results< string::const_iterator > what; 209 boost::match_flag_type flags = boost::match_default; 210 211 if(!is_css(full_path)) 212 { 213 while( boost::regex_search( start, end, what, html_url_regex, flags) ) 214 { 215 // what[0] contains the whole string iterators. 216 // what[1] contains the element type iterators. 217 // what[3] contains the URL iterators. 
218 219 if(what[3].matched) 220 { 221 string type( what[1].first, what[1].second ); 222 boost::algorithm::to_lower(type); 223 224 // TODO: Complain if 'link' tags use external stylesheets. 225 do_url( string( what[3].first, what[3].second ), 226 library_name, full_path, no_link_errors, 227 type == "a" || type == "link", contents.begin(), what[3].first ); 228 } 229 230 start = what[0].second; // update search position 231 flags |= boost::match_prev_avail; // update flags 232 flags |= boost::match_not_bob; 233 } 234 } 235 236 while( boost::regex_search( start, end, what, css_url_regex, flags) ) 237 { 238 // what[0] contains the whole string iterators. 239 // what[2] contains the URL iterators. 240 241 if(what[2].matched) 242 { 243 do_url( string( what[2].first, what[2].second ), 244 library_name, full_path, no_link_errors, false, 245 contents.begin(), what[3].first ); 246 } 247 248 start = what[0].second; // update search position 249 flags |= boost::match_prev_avail; // update flags 250 flags |= boost::match_not_bob; 251 } 252 } 253 254 // do_url ------------------------------------------------------------------// 255 do_url(const string & url,const string & library_name,const path & source_path,bool no_link_errors,bool allow_external_content,std::string::const_iterator contents_begin,std::string::const_iterator url_start)256 void link_check::do_url( const string & url, const string & library_name, 257 const path & source_path, bool no_link_errors, bool allow_external_content, 258 std::string::const_iterator contents_begin, 259 std::string::const_iterator url_start ) 260 // precondition: source_path.is_complete() 261 { 262 if(!no_link_errors && url.empty()) { 263 ++m_invalid_errors; 264 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 265 error( library_name, source_path, "Empty URL.", ln ); 266 return; 267 } 268 269 // Decode ampersand encoded characters. 270 string decoded_url = is_css(source_path) ? 
url : decode_ampersands(url); 271 if(decoded_url.empty()) { 272 if(!no_link_errors) { 273 ++m_invalid_errors; 274 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 275 error( library_name, source_path, 276 "Invalid URL (invalid ampersand encodings): " + url, ln ); 277 } 278 return; 279 } 280 281 boost::smatch m; 282 if(!boost::regex_match(decoded_url, m, url_decompose_regex)) { 283 if(!no_link_errors) { 284 ++m_invalid_errors; 285 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 286 error( library_name, source_path, "Invalid URL: " + decoded_url, ln ); 287 } 288 return; 289 } 290 291 bool scheme_matched = m[2].matched, 292 authority_matched = m[4].matched, 293 //query_matched = m[7].matched, 294 fragment_matched = m[9].matched; 295 296 std::string scheme(m[2]), 297 authority(m[4]), 298 url_path(m[5]), 299 //query(m[7]), 300 fragment(m[9]); 301 302 // Check for external content 303 if(!allow_external_content && (authority_matched || scheme_matched)) { 304 if(!no_link_errors) { 305 ++m_invalid_errors; 306 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 307 error( library_name, source_path, "External content: " + decoded_url, ln ); 308 } 309 } 310 311 // Protocol checks 312 if(scheme_matched) { 313 if(scheme == "http" || scheme == "https") { 314 // All http links should have a hostname. Generally if they don't 315 // it's by mistake. If they shouldn't, then a protocol isn't 316 // required. 
317 if(!authority_matched) { 318 if(!no_link_errors) { 319 ++m_invalid_errors; 320 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 321 error( library_name, source_path, "No hostname: " + decoded_url, ln ); 322 } 323 } 324 325 return; 326 } 327 else if(scheme == "file") { 328 if(!no_link_errors) { 329 ++m_invalid_errors; 330 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 331 error( library_name, source_path, 332 "Invalid URL (hardwired file): " + decoded_url, ln ); 333 } 334 } 335 else if(scheme == "mailto" || scheme == "ftp" 336 || scheme == "news" || scheme == "javascript") { 337 if ( !no_link_errors && is_css(source_path) ) { 338 ++m_invalid_errors; 339 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 340 error( library_name, source_path, 341 "Invalid protocol for css: " + decoded_url, ln ); 342 } 343 } 344 else { 345 if(!no_link_errors) { 346 ++m_invalid_errors; 347 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 348 error( library_name, source_path, 349 "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln ); 350 } 351 } 352 353 return; 354 } 355 356 // Hostname without protocol. 
357 if(authority_matched) { 358 if(!no_link_errors) { 359 ++m_invalid_errors; 360 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 361 error( library_name, source_path, 362 "Invalid URL (hostname without protocol): " + decoded_url, ln ); 363 } 364 } 365 366 // Check the fragment identifier 367 if ( fragment_matched ) { 368 if ( is_css(source_path) ) { 369 if ( !no_link_errors ) { 370 ++m_invalid_errors; 371 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 372 error( library_name, source_path, 373 "Fragment link in CSS: " + decoded_url, ln ); 374 } 375 } 376 else { 377 if ( !no_link_errors && fragment.find( '#' ) != string::npos ) 378 { 379 ++m_bookmark_errors; 380 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 381 error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln ); 382 } 383 else if ( !no_link_errors && url_path.empty() && !fragment.empty() 384 // w3.org recommends case-sensitive broken bookmark checking 385 // since some browsers do a case-sensitive match. 386 && bookmarks.find(decode_percents(fragment)) == bookmarks.end() ) 387 { 388 ++m_broken_errors; 389 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 390 error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln ); 391 } 392 } 393 394 // No more to do if it's just a fragment identifier 395 if(url_path.empty()) return; 396 } 397 398 // Detect characters banned by RFC2396: 399 if ( !no_link_errors 400 && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos ) 401 { 402 ++m_invalid_errors; 403 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 404 error( library_name, source_path, 405 "Invalid character in URL: " + decoded_url, ln ); 406 } 407 408 // Check that we actually have a path. 
409 if(url_path.empty()) { 410 if(!no_link_errors) { 411 ++m_invalid_errors; 412 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 413 error( library_name, source_path, 414 "Invalid URL (empty path in relative url): " + decoded_url, ln ); 415 } 416 } 417 418 // Decode percent encoded characters. 419 string decoded_path = decode_percents(url_path); 420 if(decoded_path.empty()) { 421 if(!no_link_errors) { 422 ++m_invalid_errors; 423 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 424 error( library_name, source_path, 425 "Invalid URL (invalid character encodings): " + decoded_url, ln ); 426 } 427 return; 428 } 429 430 // strip url of references to current dir 431 if ( decoded_path[0]=='.' && decoded_path[1]=='/' ) decoded_path.erase( 0, 2 ); 432 433 // url is relative source_path.branch() 434 // convert to target_path, which is_complete() 435 path target_path; 436 try { target_path = source_path.branch_path() /= path( decoded_path ); } 437 catch ( const fs::filesystem_error & ) 438 { 439 if(!no_link_errors) { 440 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 441 ++m_invalid_errors; 442 error( library_name, source_path, 443 "Invalid URL (error resolving path): " + decoded_url, ln ); 444 } 445 return; 446 } 447 448 // create a m_paths entry if necessary 449 std::pair< const string, int > entry( 450 relative_to( target_path, search_root_path() ), 0 ); 451 m_path_map::iterator itr( m_paths.find( entry.first ) ); 452 if ( itr == m_paths.end() ) 453 { 454 if ( fs::exists( target_path ) ) entry.second = m_present; 455 itr = m_paths.insert( entry ).first; 456 } 457 458 // itr now points to the m_paths entry 459 itr->second |= m_linked_to; 460 461 // if target isn't present, the link is broken 462 if ( !no_link_errors && (itr->second & m_present) == 0 ) 463 { 464 ++m_broken_errors; 465 std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1; 466 error( library_name, source_path, "Broken link: " + 
decoded_url, ln ); 467 } 468 } 469 470 // close -------------------------------------------------------------------// 471 close()472 void link_check::close() 473 { 474 for ( m_path_map::const_iterator itr = m_paths.begin(); 475 itr != m_paths.end(); ++itr ) 476 { 477 // std::clog << itr->first << " " << itr->second << "\n"; 478 if ( (itr->second & m_linked_to) != m_linked_to 479 && (itr->second & m_nounlinked_errors) != m_nounlinked_errors 480 && (itr->first.rfind( ".html" ) == itr->first.size()-5 481 || itr->first.rfind( ".htm" ) == itr->first.size()-4 482 || itr->first.rfind( ".css" ) == itr->first.size()-4) 483 // because they may be redirectors, it is OK if these are unlinked: 484 && itr->first.rfind( "index.html" ) == string::npos 485 && itr->first.rfind( "index.htm" ) == string::npos ) 486 { 487 ++m_unlinked_errors; 488 path full_path( search_root_path() / path(itr->first) ); 489 error( impute_library( full_path ), full_path, 490 loclink(full_path, "Unlinked file") ); 491 } 492 } 493 } 494 495 } // namespace inspect 496 } // namespace boost 497 498