1 //  link_check implementation  -----------------------------------------------//
2 
3 //  Copyright Beman Dawes 2002.
4 //
5 //  Distributed under the Boost Software License, Version 1.0.
6 //  (See accompanying file LICENSE_1_0.txt or copy at
7 //  http://www.boost.org/LICENSE_1_0.txt)
8 
9 #include <hpx/config/defines.hpp>
10 
11 #include "link_check.hpp"
12 #include "function_hyper.hpp"
13 #include "boost/regex.hpp"
14 #include "boost/filesystem/operations.hpp"
15 #include <boost/algorithm/string/case_conv.hpp>
16 #include <cstdlib>
17 #include <set>
18 
19 // #include <iostream>
20 
21 namespace fs = boost::filesystem;
22 
23 namespace
24 {
25   boost::regex html_bookmark_regex(
26     "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3"
27     "|<!--.*?-->",
28     boost::regbase::normal | boost::regbase::icase);
29   boost::regex html_url_regex(
30     "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)" // HREF or SRC
31     "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2"
32     "|<!--.*?-->",
33     boost::regbase::normal | boost::regbase::icase);
34   boost::regex css_url_regex(
35     "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)"
36     "|/\\*.*?\\*/",
37     boost::regbase::normal | boost::regbase::icase);
38 
39   // Regular expression for parsing URLS from:
40   // http://tools.ietf.org/html/rfc3986#appendix-B
41   boost::regex url_decompose_regex(
42     "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
43     boost::regbase::normal);
44 
45     typedef std::set<std::string> bookmark_set;
46     bookmark_set bookmarks;
47     bookmark_set bookmarks_lowercase; // duplicate check needs case insensitive
48 
// Decode HTML-escaped ampersands.
//
// Every occurrence of "&amp;" in url_path is replaced by a single '&'; any
// other '&' is copied through unchanged. All occurrences are processed, not
// just the first one.
//
// This function has no failure mode: the result is empty only when url_path
// itself is empty.
std::string decode_ampersands(std::string const& url_path) {
  std::string::size_type pos = 0, next;
  std::string result;
  result.reserve(url_path.length());

  while((next = url_path.find('&', pos)) != std::string::npos) {
    // Copy the text that precedes this ampersand verbatim.
    result.append(url_path, pos, next - pos);
    result += '&';
    // Skip the whole entity when it is "&amp;", otherwise only the '&'.
    pos = next + (url_path.compare(next, 5, "&amp;") == 0 ? 5 : 1);
  }

  // Copy whatever follows the last ampersand (or the whole input if none).
  result.append(url_path, pos, std::string::npos);

  return result;
}
71 
// Numeric value of one hexadecimal digit, or -1 when c is not a hex digit.
int hex_digit_value(char c) {
  if(c >= '0' && c <= '9') return c - '0';
  if(c >= 'a' && c <= 'f') return c - 'a' + 10;
  if(c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}

// Decode percent encoded characters ("%41" -> 'A').
//
// Returns the decoded text, or an empty string if there's an error: a '%'
// that is not followed by exactly two hexadecimal digits. Note that an empty
// input also yields an empty result, so callers that need to tell the two
// apart must check for empty input themselves.
std::string decode_percents(std::string const& url_path) {
  std::string::size_type pos = 0, next;
  std::string result;
  result.reserve(url_path.length());

  while((next = url_path.find('%', pos)) != std::string::npos) {
    // Copy the text that precedes this escape verbatim.
    result.append(url_path, pos, next - pos);

    // An escape needs the '%' plus two more characters.
    if(url_path.length() - next < 3) return "";

    // Validate each digit explicitly; a general number parser would also
    // accept signs and leading whitespace such as "%+1" or "% 1".
    int const high = hex_digit_value(url_path[next + 1]);
    int const low = hex_digit_value(url_path[next + 2]);
    if(high < 0 || low < 0) return "";

    result += static_cast<char>(high * 16 + low);
    pos = next + 3;
  }

  // Copy whatever follows the last escape (or the whole input if none).
  result.append(url_path, pos, std::string::npos);

  return result;
}
98 
is_css(const path & p)99   bool is_css(const path & p) {
100       return p.extension() == ".css";
101   }
102 
103 } // unnamed namespace
104 
105 namespace boost
106 {
107   namespace inspect
108   {
109 
110 //  link_check constructor  --------------------------------------------------//
111 
link_check()112    link_check::link_check()
113      : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0),
114        m_bookmark_errors(0), m_duplicate_bookmark_errors(0)
115    {
116        // HTML signatures are already registered by the base class,
117        // 'hypertext_inspector'
118        register_signature(".css");
119    }
120 
121 //  inspect (all)  -----------------------------------------------------------//
122 
inspect(const string &,const path & full_path)123    void link_check::inspect(
124       const string & /*library_name*/,
125       const path & full_path )
126     {
127       // keep track of paths already encountered to reduce disk activity
128       if ( !fs::is_directory( full_path ) )
129         m_paths[ relative_to( full_path, search_root_path() ) ] |= m_present;
130     }
131 
132 //  inspect ( .htm, .html, .shtml, .css )  -----------------------------------//
133 
inspect(const string & library_name,const path & full_path,const string & contents)134    void link_check::inspect(
135       const string & library_name,
136       const path & full_path,   // example: c:/foo/boost/filesystem/path.hpp
137       const string & contents )     // contents of file to be inspected
138     {
139       if (contents.find( "hpxinspect:" "nounlinked" ) != string::npos)
140           m_paths[ relative_to( full_path, search_root_path() ) ] |= m_nounlinked_errors;
141 
142       bool no_link_errors =
143           (contents.find( "hpxinspect:" "nolink" ) != string::npos);
144 
145       // build bookmarks databases
146       bookmarks.clear();
147       bookmarks_lowercase.clear();
148       string::const_iterator a_start( contents.begin() );
149       string::const_iterator a_end( contents.end() );
150       boost::match_results< string::const_iterator > a_what;
151       boost::match_flag_type a_flags = boost::match_default;
152 
153       if(!is_css(full_path))
154       {
155         string previous_id;
156 
157         while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) )
158         {
159           // a_what[0] contains the whole string iterators.
160           // a_what[1] contains the tag iterators.
161           // a_what[2] contains the attribute name.
162           // a_what[4] contains the bookmark iterators.
163 
164           if (a_what[4].matched)
165           {
166             string tag( a_what[1].first, a_what[1].second );
167             boost::algorithm::to_lower(tag);
168             string attribute( a_what[2].first, a_what[2].second );
169             boost::algorithm::to_lower(attribute);
170             string bookmark( a_what[4].first, a_what[4].second );
171 
172             bool name_following_id = ( attribute == "name" && previous_id == bookmark );
173             if ( tag != "meta" && attribute == "id" ) previous_id = bookmark;
174             else previous_id.clear();
175 
176             if ( tag != "meta" && !name_following_id )
177             {
178               bookmarks.insert( bookmark );
179 //              std::cout << "******************* " << bookmark << '\n';
180 
181               // w3.org recommends case-insensitive checking for duplicate bookmarks
182               // since some browsers do a case-insensitive match.
183               string bookmark_lowercase( bookmark );
184               boost::algorithm::to_lower(bookmark_lowercase);
185 
186               std::pair<bookmark_set::iterator, bool> result
187                 = bookmarks_lowercase.insert( bookmark_lowercase );
188               if (!result.second)
189               {
190                 ++m_duplicate_bookmark_errors;
191                 std::size_t ln = std::count( contents.begin(),
192                     a_what[3].first, '\n' ) + 1;
193                 error( library_name, full_path, "Duplicate bookmark: "
194                     + bookmark, ln );
195               }
196             }
197           }
198 
199           a_start = a_what[0].second; // update search position
200           a_flags |= boost::match_prev_avail; // update flags
201           a_flags |= boost::match_not_bob;
202         }
203       }
204 
205       // process urls
206       string::const_iterator start( contents.begin() );
207       string::const_iterator end( contents.end() );
208       boost::match_results< string::const_iterator > what;
209       boost::match_flag_type flags = boost::match_default;
210 
211       if(!is_css(full_path))
212       {
213         while( boost::regex_search( start, end, what, html_url_regex, flags) )
214         {
215           // what[0] contains the whole string iterators.
216           // what[1] contains the element type iterators.
217           // what[3] contains the URL iterators.
218 
219           if(what[3].matched)
220           {
221             string type( what[1].first, what[1].second );
222             boost::algorithm::to_lower(type);
223 
224             // TODO: Complain if 'link' tags use external stylesheets.
225             do_url( string( what[3].first, what[3].second ),
226               library_name, full_path, no_link_errors,
227               type == "a" || type == "link", contents.begin(), what[3].first );
228           }
229 
230           start = what[0].second; // update search position
231           flags |= boost::match_prev_avail; // update flags
232           flags |= boost::match_not_bob;
233         }
234       }
235 
236       while( boost::regex_search( start, end, what, css_url_regex, flags) )
237       {
238         // what[0] contains the whole string iterators.
239         // what[2] contains the URL iterators.
240 
241         if(what[2].matched)
242         {
243           do_url( string( what[2].first, what[2].second ),
244             library_name, full_path, no_link_errors, false,
245             contents.begin(), what[3].first );
246         }
247 
248         start = what[0].second; // update search position
249         flags |= boost::match_prev_avail; // update flags
250         flags |= boost::match_not_bob;
251       }
252     }
253 
254 //  do_url  ------------------------------------------------------------------//
255 
do_url(const string & url,const string & library_name,const path & source_path,bool no_link_errors,bool allow_external_content,std::string::const_iterator contents_begin,std::string::const_iterator url_start)256     void link_check::do_url( const string & url, const string & library_name,
257       const path & source_path, bool no_link_errors, bool allow_external_content,
258         std::string::const_iterator contents_begin,
259         std::string::const_iterator url_start )
260         // precondition: source_path.is_complete()
261     {
262       if(!no_link_errors && url.empty()) {
263         ++m_invalid_errors;
264         std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
265         error( library_name, source_path, "Empty URL.", ln );
266         return;
267       }
268 
269       // Decode ampersand encoded characters.
270       string decoded_url = is_css(source_path) ? url : decode_ampersands(url);
271       if(decoded_url.empty()) {
272         if(!no_link_errors) {
273           ++m_invalid_errors;
274           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
275           error( library_name, source_path,
276             "Invalid URL (invalid ampersand encodings): " + url, ln );
277         }
278         return;
279       }
280 
281       boost::smatch m;
282       if(!boost::regex_match(decoded_url, m, url_decompose_regex)) {
283         if(!no_link_errors) {
284           ++m_invalid_errors;
285           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
286           error( library_name, source_path, "Invalid URL: " + decoded_url, ln );
287         }
288         return;
289       }
290 
291       bool scheme_matched = m[2].matched,
292         authority_matched = m[4].matched,
293         //query_matched = m[7].matched,
294         fragment_matched = m[9].matched;
295 
296       std::string scheme(m[2]),
297         authority(m[4]),
298         url_path(m[5]),
299         //query(m[7]),
300         fragment(m[9]);
301 
302       // Check for external content
303       if(!allow_external_content && (authority_matched || scheme_matched)) {
304         if(!no_link_errors) {
305           ++m_invalid_errors;
306           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
307           error( library_name, source_path, "External content: " + decoded_url, ln );
308         }
309       }
310 
311       // Protocol checks
312       if(scheme_matched) {
313         if(scheme == "http" || scheme == "https") {
314           // All http links should have a hostname. Generally if they don't
315           // it's by mistake. If they shouldn't, then a protocol isn't
316           // required.
317           if(!authority_matched) {
318             if(!no_link_errors) {
319               ++m_invalid_errors;
320               std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
321               error( library_name, source_path, "No hostname: " + decoded_url, ln );
322             }
323           }
324 
325           return;
326         }
327         else if(scheme == "file") {
328           if(!no_link_errors) {
329             ++m_invalid_errors;
330             std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
331             error( library_name, source_path,
332               "Invalid URL (hardwired file): " + decoded_url, ln );
333           }
334         }
335         else if(scheme == "mailto" || scheme == "ftp"
336             || scheme == "news" || scheme == "javascript") {
337           if ( !no_link_errors && is_css(source_path) ) {
338             ++m_invalid_errors;
339             std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
340             error( library_name, source_path,
341               "Invalid protocol for css: " + decoded_url, ln );
342           }
343         }
344         else {
345           if(!no_link_errors) {
346             ++m_invalid_errors;
347             std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
348             error( library_name, source_path,
349                 "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln );
350           }
351         }
352 
353         return;
354       }
355 
356       // Hostname without protocol.
357       if(authority_matched) {
358         if(!no_link_errors) {
359           ++m_invalid_errors;
360           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
361           error( library_name, source_path,
362             "Invalid URL (hostname without protocol): " + decoded_url, ln );
363         }
364       }
365 
366       // Check the fragment identifier
367       if ( fragment_matched ) {
368         if ( is_css(source_path) ) {
369             if ( !no_link_errors ) {
370               ++m_invalid_errors;
371               std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
372               error( library_name, source_path,
373                 "Fragment link in CSS: " + decoded_url, ln );
374             }
375         }
376         else {
377           if ( !no_link_errors && fragment.find( '#' ) != string::npos )
378           {
379             ++m_bookmark_errors;
380             std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
381             error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln );
382           }
383           else if ( !no_link_errors && url_path.empty() && !fragment.empty()
384             // w3.org recommends case-sensitive broken bookmark checking
385             // since some browsers do a case-sensitive match.
386             && bookmarks.find(decode_percents(fragment)) == bookmarks.end() )
387           {
388             ++m_broken_errors;
389             std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
390             error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln );
391           }
392         }
393 
394         // No more to do if it's just a fragment identifier
395         if(url_path.empty()) return;
396       }
397 
398       // Detect characters banned by RFC2396:
399       if ( !no_link_errors
400           && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
401       {
402         ++m_invalid_errors;
403         std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
404         error( library_name, source_path,
405           "Invalid character in URL: " + decoded_url, ln );
406       }
407 
408       // Check that we actually have a path.
409       if(url_path.empty()) {
410         if(!no_link_errors) {
411           ++m_invalid_errors;
412           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
413           error( library_name, source_path,
414             "Invalid URL (empty path in relative url): " + decoded_url, ln );
415         }
416       }
417 
418       // Decode percent encoded characters.
419       string decoded_path = decode_percents(url_path);
420       if(decoded_path.empty()) {
421         if(!no_link_errors) {
422           ++m_invalid_errors;
423           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
424           error( library_name, source_path,
425             "Invalid URL (invalid character encodings): " + decoded_url, ln );
426         }
427         return;
428       }
429 
430       // strip url of references to current dir
431       if ( decoded_path[0]=='.' && decoded_path[1]=='/' ) decoded_path.erase( 0, 2 );
432 
433       // url is relative source_path.branch()
434       // convert to target_path, which is_complete()
435       path target_path;
436       try { target_path = source_path.branch_path() /= path( decoded_path ); }
437       catch ( const fs::filesystem_error & )
438       {
439         if(!no_link_errors) {
440           std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
441           ++m_invalid_errors;
442           error( library_name, source_path,
443             "Invalid URL (error resolving path): " + decoded_url, ln );
444         }
445         return;
446       }
447 
448       // create a m_paths entry if necessary
449       std::pair< const string, int > entry(
450         relative_to( target_path, search_root_path() ), 0 );
451       m_path_map::iterator itr( m_paths.find( entry.first ) );
452       if ( itr == m_paths.end() )
453       {
454         if ( fs::exists( target_path ) ) entry.second = m_present;
455         itr = m_paths.insert( entry ).first;
456       }
457 
458       // itr now points to the m_paths entry
459       itr->second |= m_linked_to;
460 
461       // if target isn't present, the link is broken
462       if ( !no_link_errors && (itr->second & m_present) == 0 )
463       {
464         ++m_broken_errors;
465         std::size_t ln = std::count( contents_begin, url_start, '\n' ) + 1;
466         error( library_name, source_path, "Broken link: " + decoded_url, ln );
467       }
468     }
469 
470 //  close  -------------------------------------------------------------------//
471 
close()472    void link_check::close()
473    {
474      for ( m_path_map::const_iterator itr = m_paths.begin();
475        itr != m_paths.end(); ++itr )
476      {
477 // std::clog << itr->first << " " << itr->second << "\n";
478        if ( (itr->second & m_linked_to) != m_linked_to
479          && (itr->second & m_nounlinked_errors) != m_nounlinked_errors
480          && (itr->first.rfind( ".html" ) == itr->first.size()-5
481           || itr->first.rfind( ".htm" ) == itr->first.size()-4
482           || itr->first.rfind( ".css" ) == itr->first.size()-4)
483          // because they may be redirectors, it is OK if these are unlinked:
484          && itr->first.rfind( "index.html" ) == string::npos
485          && itr->first.rfind( "index.htm" ) == string::npos )
486        {
487          ++m_unlinked_errors;
488          path full_path( search_root_path() / path(itr->first) );
489          error( impute_library( full_path ), full_path,
490              loclink(full_path, "Unlinked file") );
491        }
492      }
493    }
494 
495   } // namespace inspect
496 } // namespace boost
497 
498