1 //  link_check implementation  -----------------------------------------------//
2 
3 //  Copyright Beman Dawes 2002.
4 //
5 //  Distributed under the Boost Software License, Version 1.0.
6 //  (See accompanying file LICENSE_1_0.txt or copy at
7 //  http://www.boost.org/LICENSE_1_0.txt)
8 
9 #include "link_check.hpp"
10 #include "boost/regex.hpp"
11 #include "boost/filesystem/operations.hpp"
12 #include <boost/algorithm/string/case_conv.hpp>
13 #include <cstdlib>
14 #include <set>
15 
16 // #include <iostream>
17 
18 namespace fs = boost::filesystem;
19 
20 namespace
21 {
22   boost::regex html_bookmark_regex(
23     "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3"
24     "|<!--.*?-->",
25     boost::regbase::normal | boost::regbase::icase);
26   boost::regex html_url_regex(
27     "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)" // HREF or SRC
28     "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2"
29     "|<!--.*?-->",
30     boost::regbase::normal | boost::regbase::icase);
31   boost::regex css_url_regex(
32     "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)"
33     "|/\\*.*?\\*/",
34     boost::regbase::normal | boost::regbase::icase);
35 
36   // Regular expression for parsing URLS from:
37   // http://tools.ietf.org/html/rfc3986#appendix-B
38   boost::regex url_decompose_regex(
39     "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
40     boost::regbase::normal);
41 
42     typedef std::set<std::string> bookmark_set;
43     bookmark_set bookmarks;
44     bookmark_set bookmarks_lowercase; // duplicate check needs case insensitive
45 
46   // Decode html escapsed ampersands, returns an empty string if there's an error.
decode_ampersands(std::string const & url_path)47   std::string decode_ampersands(std::string const& url_path) {
48     std::string::size_type pos = 0, next;
49     std::string result;
50     result.reserve(url_path.length());
51 
52     while((next = url_path.find('&', pos)) != std::string::npos) {
53       result.append(url_path, pos, next - pos);
54       pos = next;
55       if(url_path.substr(pos, 5) == "&amp;") {
56         result += '&'; pos += 5;
57       }
58       else {
59         result += '&'; pos += 1;
60       }
61       break;
62     }
63 
64     result.append(url_path, pos, url_path.length());
65 
66     return result;
67   }
68 
69   // Decode percent encoded characters, returns an empty string if there's an error.
decode_percents(std::string const & url_path)70   std::string decode_percents(std::string const& url_path) {
71     std::string::size_type pos = 0, next;
72     std::string result;
73     result.reserve(url_path.length());
74 
75     while((next = url_path.find('%', pos)) != std::string::npos) {
76       result.append(url_path, pos, next - pos);
77       pos = next;
78       switch(url_path[pos]) {
79         case '%': {
80           if(url_path.length() - next < 3) return "";
81           char hex[3] = { url_path[next + 1], url_path[next + 2], '\0' };
82           char* end_ptr;
83           result += (char) std::strtol(hex, &end_ptr, 16);
84           if(*end_ptr) return "";
85           pos = next + 3;
86           break;
87         }
88       }
89     }
90 
91     result.append(url_path, pos, url_path.length());
92 
93     return result;
94   }
95 
is_css(const path & p)96   bool is_css(const path & p) {
97       return p.extension() == ".css";
98   }
99 
100 } // unnamed namespace
101 
102 namespace boost
103 {
104   namespace inspect
105   {
106 
107 //  link_check constructor  --------------------------------------------------//
108 
link_check()109    link_check::link_check()
110      : m_broken_errors(0), m_unlinked_errors(0), m_invalid_errors(0),
111        m_bookmark_errors(0), m_duplicate_bookmark_errors(0)
112    {
113        // HTML signatures are already registered by the base class,
114        // 'hypertext_inspector'
115        register_signature(".css");
116    }
117 
118 //  inspect (all)  -----------------------------------------------------------//
119 
inspect(const string &,const path & full_path)120    void link_check::inspect(
121       const string & /*library_name*/,
122       const path & full_path )
123     {
124       // keep track of paths already encountered to reduce disk activity
125       if ( !fs::is_directory( full_path ) )
126         m_paths[ relative_to( full_path, search_root_path() ) ] |= m_present;
127     }
128 
129 //  inspect ( .htm, .html, .shtml, .css )  -----------------------------------//
130 
inspect(const string & library_name,const path & full_path,const string & contents)131    void link_check::inspect(
132       const string & library_name,
133       const path & full_path,   // example: c:/foo/boost/filesystem/path.hpp
134       const string & contents )     // contents of file to be inspected
135     {
136       if (contents.find( "boostinspect:" "nounlinked" ) != string::npos)
137           m_paths[ relative_to( full_path, search_root_path() ) ] |= m_nounlinked_errors;
138 
139       bool no_link_errors =
140           (contents.find( "boostinspect:" "nolink" ) != string::npos);
141 
142       // build bookmarks databases
143       bookmarks.clear();
144       bookmarks_lowercase.clear();
145       string::const_iterator a_start( contents.begin() );
146       string::const_iterator a_end( contents.end() );
147       boost::match_results< string::const_iterator > a_what;
148       boost::match_flag_type a_flags = boost::match_default;
149 
150       if(!is_css(full_path))
151       {
152         string previous_id;
153 
154         while( boost::regex_search( a_start, a_end, a_what, html_bookmark_regex, a_flags) )
155         {
156           // a_what[0] contains the whole string iterators.
157           // a_what[1] contains the tag iterators.
158           // a_what[2] contains the attribute name.
159           // a_what[4] contains the bookmark iterators.
160 
161           if (a_what[4].matched)
162           {
163             string tag( a_what[1].first, a_what[1].second );
164             boost::algorithm::to_lower(tag);
165             string attribute( a_what[2].first, a_what[2].second );
166             boost::algorithm::to_lower(attribute);
167             string bookmark( a_what[4].first, a_what[4].second );
168 
169             bool name_following_id = ( attribute == "name" && previous_id == bookmark );
170             if ( tag != "meta" && attribute == "id" ) previous_id = bookmark;
171             else previous_id.clear();
172 
173             if ( tag != "meta" && !name_following_id )
174             {
175               bookmarks.insert( bookmark );
176 //              std::cout << "******************* " << bookmark << '\n';
177 
178               // w3.org recommends case-insensitive checking for duplicate bookmarks
179               // since some browsers do a case-insensitive match.
180               string bookmark_lowercase( bookmark );
181               boost::algorithm::to_lower(bookmark_lowercase);
182 
183               std::pair<bookmark_set::iterator, bool> result
184                 = bookmarks_lowercase.insert( bookmark_lowercase );
185               if (!result.second)
186               {
187                 ++m_duplicate_bookmark_errors;
188                 int ln = std::count( contents.begin(), a_what[3].first, '\n' ) + 1;
189                 error( library_name, full_path, "Duplicate bookmark: " + bookmark, ln );
190               }
191             }
192           }
193 
194           a_start = a_what[0].second; // update search position
195           a_flags |= boost::match_prev_avail; // update flags
196           a_flags |= boost::match_not_bob;
197         }
198       }
199 
200       // process urls
201       string::const_iterator start( contents.begin() );
202       string::const_iterator end( contents.end() );
203       boost::match_results< string::const_iterator > what;
204       boost::match_flag_type flags = boost::match_default;
205 
206       if(!is_css(full_path))
207       {
208         while( boost::regex_search( start, end, what, html_url_regex, flags) )
209         {
210           // what[0] contains the whole string iterators.
211           // what[1] contains the element type iterators.
212           // what[3] contains the URL iterators.
213 
214           if(what[3].matched)
215           {
216             string type( what[1].first, what[1].second );
217             boost::algorithm::to_lower(type);
218 
219             // TODO: Complain if 'link' tags use external stylesheets.
220             do_url( string( what[3].first, what[3].second ),
221               library_name, full_path, no_link_errors,
222               type == "a" || type == "link", contents.begin(), what[3].first );
223           }
224 
225           start = what[0].second; // update search position
226           flags |= boost::match_prev_avail; // update flags
227           flags |= boost::match_not_bob;
228         }
229       }
230 
231       while( boost::regex_search( start, end, what, css_url_regex, flags) )
232       {
233         // what[0] contains the whole string iterators.
234         // what[2] contains the URL iterators.
235 
236         if(what[2].matched)
237         {
238           do_url( string( what[2].first, what[2].second ),
239             library_name, full_path, no_link_errors, false,
240             contents.begin(), what[3].first );
241         }
242 
243         start = what[0].second; // update search position
244         flags |= boost::match_prev_avail; // update flags
245         flags |= boost::match_not_bob;
246       }
247     }
248 
249 //  do_url  ------------------------------------------------------------------//
250 
do_url(const string & url,const string & library_name,const path & source_path,bool no_link_errors,bool allow_external_content,std::string::const_iterator contents_begin,std::string::const_iterator url_start)251     void link_check::do_url( const string & url, const string & library_name,
252       const path & source_path, bool no_link_errors, bool allow_external_content,
253         std::string::const_iterator contents_begin, std::string::const_iterator url_start )
254         // precondition: source_path.is_complete()
255     {
256       if(!no_link_errors && url.empty()) {
257         ++m_invalid_errors;
258         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
259         error( library_name, source_path, "Empty URL.", ln );
260         return;
261       }
262 
263       // Decode ampersand encoded characters.
264       string decoded_url = is_css(source_path) ? url : decode_ampersands(url);
265       if(decoded_url.empty()) {
266         if(!no_link_errors) {
267           ++m_invalid_errors;
268           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
269           error( library_name, source_path,
270             "Invalid URL (invalid ampersand encodings): " + url, ln );
271         }
272         return;
273       }
274 
275       boost::smatch m;
276       if(!boost::regex_match(decoded_url, m, url_decompose_regex)) {
277         if(!no_link_errors) {
278           ++m_invalid_errors;
279           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
280           error( library_name, source_path, "Invalid URL: " + decoded_url, ln );
281         }
282         return;
283       }
284 
285       bool scheme_matched = m[2].matched,
286         authority_matched = m[4].matched,
287         //query_matched = m[7].matched,
288         fragment_matched = m[9].matched;
289 
290       std::string scheme(m[2]),
291         authority(m[4]),
292         url_path(m[5]),
293         //query(m[7]),
294         fragment(m[9]);
295 
296       // Check for external content
297       if(!allow_external_content && (authority_matched || scheme_matched)) {
298         if(!no_link_errors) {
299           ++m_invalid_errors;
300           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
301           error( library_name, source_path, "External content: " + decoded_url, ln );
302         }
303       }
304 
305       // Protocol checks
306       if(scheme_matched) {
307         if(scheme == "http" || scheme == "https") {
308           // All http links should have a hostname. Generally if they don't
309           // it's by mistake. If they shouldn't, then a protocol isn't
310           // required.
311           if(!authority_matched) {
312             if(!no_link_errors) {
313               ++m_invalid_errors;
314               int ln = std::count( contents_begin, url_start, '\n' ) + 1;
315               error( library_name, source_path, "No hostname: " + decoded_url, ln );
316             }
317           }
318 
319           return;
320         }
321         else if(scheme == "file") {
322           if(!no_link_errors) {
323             ++m_invalid_errors;
324             int ln = std::count( contents_begin, url_start, '\n' ) + 1;
325             error( library_name, source_path,
326               "Invalid URL (hardwired file): " + decoded_url, ln );
327           }
328         }
329         else if(scheme == "mailto" || scheme == "ftp" || scheme == "news" || scheme == "javascript") {
330           if ( !no_link_errors && is_css(source_path) ) {
331             ++m_invalid_errors;
332             int ln = std::count( contents_begin, url_start, '\n' ) + 1;
333             error( library_name, source_path,
334               "Invalid protocol for css: " + decoded_url, ln );
335           }
336         }
337         else {
338           if(!no_link_errors) {
339             ++m_invalid_errors;
340             int ln = std::count( contents_begin, url_start, '\n' ) + 1;
341             error( library_name, source_path, "Unknown protocol: '" + scheme + "' in url: " + decoded_url, ln );
342           }
343         }
344 
345         return;
346       }
347 
348       // Hostname without protocol.
349       if(authority_matched) {
350         if(!no_link_errors) {
351           ++m_invalid_errors;
352           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
353           error( library_name, source_path,
354             "Invalid URL (hostname without protocol): " + decoded_url, ln );
355         }
356       }
357 
358       // Check the fragment identifier
359       if ( fragment_matched ) {
360         if ( is_css(source_path) ) {
361             if ( !no_link_errors ) {
362               ++m_invalid_errors;
363               int ln = std::count( contents_begin, url_start, '\n' ) + 1;
364               error( library_name, source_path,
365                 "Fragment link in CSS: " + decoded_url, ln );
366             }
367         }
368         else {
369           if ( !no_link_errors && fragment.find( '#' ) != string::npos )
370           {
371             ++m_bookmark_errors;
372             int ln = std::count( contents_begin, url_start, '\n' ) + 1;
373             error( library_name, source_path, "Invalid bookmark: " + decoded_url, ln );
374           }
375           else if ( !no_link_errors && url_path.empty() && !fragment.empty()
376             // w3.org recommends case-sensitive broken bookmark checking
377             // since some browsers do a case-sensitive match.
378             && bookmarks.find(decode_percents(fragment)) == bookmarks.end() )
379           {
380             ++m_broken_errors;
381             int ln = std::count( contents_begin, url_start, '\n' ) + 1;
382             error( library_name, source_path, "Unknown bookmark: " + decoded_url, ln );
383           }
384         }
385 
386         // No more to do if it's just a fragment identifier
387         if(url_path.empty()) return;
388       }
389 
390       // Detect characters banned by RFC2396:
391       if ( !no_link_errors && decoded_url.find_first_of( " <>\"{}|\\^[]'" ) != string::npos )
392       {
393         ++m_invalid_errors;
394         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
395         error( library_name, source_path,
396           "Invalid character in URL: " + decoded_url, ln );
397       }
398 
399       // Check that we actually have a path.
400       if(url_path.empty()) {
401         if(!no_link_errors) {
402           ++m_invalid_errors;
403           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
404           error( library_name, source_path,
405             "Invalid URL (empty path in relative url): " + decoded_url, ln );
406         }
407       }
408 
409       // Decode percent encoded characters.
410       string decoded_path = decode_percents(url_path);
411       if(decoded_path.empty()) {
412         if(!no_link_errors) {
413           ++m_invalid_errors;
414           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
415           error( library_name, source_path,
416             "Invalid URL (invalid character encodings): " + decoded_url, ln );
417         }
418         return;
419       }
420 
421       // strip url of references to current dir
422       if ( decoded_path[0]=='.' && decoded_path[1]=='/' ) decoded_path.erase( 0, 2 );
423 
424       // url is relative source_path.branch()
425       // convert to target_path, which is_complete()
426       path target_path;
427       try { target_path = source_path.branch_path() /= path( decoded_path ); }
428       catch ( const fs::filesystem_error & )
429       {
430         if(!no_link_errors) {
431           int ln = std::count( contents_begin, url_start, '\n' ) + 1;
432           ++m_invalid_errors;
433           error( library_name, source_path,
434             "Invalid URL (error resolving path): " + decoded_url, ln );
435         }
436         return;
437       }
438 
439       // create a m_paths entry if necessary
440       std::pair< const string, int > entry(
441         relative_to( target_path, search_root_path() ), 0 );
442       m_path_map::iterator itr( m_paths.find( entry.first ) );
443       if ( itr == m_paths.end() )
444       {
445         if ( fs::exists( target_path ) ) entry.second = m_present;
446         itr = m_paths.insert( entry ).first;
447       }
448 
449       // itr now points to the m_paths entry
450       itr->second |= m_linked_to;
451 
452       // if target isn't present, the link is broken
453       if ( !no_link_errors && (itr->second & m_present) == 0 )
454       {
455         ++m_broken_errors;
456         int ln = std::count( contents_begin, url_start, '\n' ) + 1;
457         error( library_name, source_path, "Broken link: " + decoded_url, ln );
458       }
459     }
460 
461 //  close  -------------------------------------------------------------------//
462 
close()463    void link_check::close()
464    {
465      for ( m_path_map::const_iterator itr = m_paths.begin();
466        itr != m_paths.end(); ++itr )
467      {
468 // std::clog << itr->first << " " << itr->second << "\n";
469        if ( (itr->second & m_linked_to) != m_linked_to
470          && (itr->second & m_nounlinked_errors) != m_nounlinked_errors
471          && (itr->first.rfind( ".html" ) == itr->first.size()-5
472           || itr->first.rfind( ".htm" ) == itr->first.size()-4
473           || itr->first.rfind( ".css" ) == itr->first.size()-4)
474          // because they may be redirectors, it is OK if these are unlinked:
475          && itr->first.rfind( "index.html" ) == string::npos
476          && itr->first.rfind( "index.htm" ) == string::npos )
477        {
478          ++m_unlinked_errors;
479          path full_path( search_root_path() / path(itr->first) );
480          error( impute_library( full_path ), full_path, "Unlinked file" );
481        }
482      }
483    }
484 
485   } // namespace inspect
486 } // namespace boost
487 
488