-----------------------------------------------------------------------------
-- Little program that checks links in HTML files, using coroutines and
-- non-blocking I/O via the dispatcher module.
-- LuaSocket sample files
-- Author: Diego Nehab
--
-- Usage: luasocket check-links.lua [-n] {<url>}
--   -n   run the checks through the "coroutine" dispatcher handler instead
--        of the "sequential" one
-- Each <url> may be an http URL or a local file path; every http link found
-- in the page is probed with a HEAD request and its status is printed.
-----------------------------------------------------------------------------
local url = require("socket.url")
local dispatch = require("dispatch")
local http = require("socket.http")
dispatch.TIMEOUT = 10

-- command-line arguments ('arg' is only set by the standalone interpreter)
local args = arg or {}

-- '-n' means we are running in non-blocking mode: use the real dispatcher
-- interface; otherwise use the fake (blocking) dispatcher interface.
-- The flag is consumed before the usage check so that "-n" alone is
-- reported as a usage error instead of silently doing nothing.
local mode = "sequential"
if args[1] == "-n" then
    table.remove(args, 1)
    mode = "coroutine"
end

-- make sure the user knows how to invoke us
if #args < 1 then
    print("Usage:\n luasocket check-links.lua [-n] {<url>}")
    os.exit(1)
end

local handler = dispatch.newhandler(mode)

-- number of status checks that were started and have not finished yet
local nthreads = 0

-- Starts a HEAD request for 'link' through the dispatcher and prints the
-- outcome when it completes. Links whose scheme is not "http" (or that
-- cannot be parsed at all) are skipped without any output.
local function getstatus(link)
    local parsed = url.parse(link, {scheme = "file"})
    if not parsed or parsed.scheme ~= "http" then return end
    nthreads = nthreads + 1
    handler:start(function()
        local r, c = http.request{
            method = "HEAD",
            url = link,
            create = handler.tcp
        }
        if r and c == 200 then io.write('\t', link, '\n')
        else io.write('\t', link, ': ', tostring(c), '\n') end
        nthreads = nthreads - 1
    end)
end

-- Reads a whole local file. 'path' is a URL-escaped file path.
-- Returns the file contents, or nil plus an error message.
local function readfile(path)
    if not path then return nil, "missing file path" end
    local file, err = io.open(url.unescape(path), "r")
    if not file then return nil, err end
    local body = file:read("*a")
    file:close()
    return body
end

-- Retrieves the document at 'u' ("http" and "file" schemes are handled).
-- Returns base, body, err:
--   base  URL that relative links in the document should be resolved
--         against (the 'location' response header, when one is present)
--   body  document contents, or nil on failure
--   err   error message or status describing the failure, if any
local function loadpage(u)
    local parsed = url.parse(u, { scheme = "file" })
    if not parsed then
        return u, nil, string.format("invalid url '%s'", tostring(u))
    end
    local base = u
    local body, err
    if parsed.scheme == "http" then
        local code, headers
        body, code, headers = http.request(u)
        if not body then
            -- on failure the second result carries the error message
            err = code
        elseif code == 200 and headers then
            -- if there was a redirect, update base to reflect it
            base = headers.location or base
        end
    elseif parsed.scheme == "file" then
        body, err = readfile(parsed.path)
    else
        err = string.format("unhandled scheme '%s'", tostring(parsed.scheme))
    end
    return base, body, err
end

-- Extracts the targets of all href attributes in 'body' and returns them
-- as a list of absolute URLs resolved against 'base'. Double-quoted values
-- are collected first, then single-quoted ones, then unquoted ones.
local function getlinks(body, base)
    -- get rid of comments
    body = string.gsub(body, "%<%!%-%-.-%-%-%>", "")
    local links = {}
    -- Records one link. Returning "" removes the matched attribute from the
    -- body: since Lua 5.1 a nil result from a gsub callback keeps the match
    -- in place, which would let the later, looser patterns pick up the same
    -- href a second time (quotes included).
    local function collect(href)
        links[#links + 1] = url.absolute(base, href)
        return ""
    end
    -- extract links
    body = string.gsub(body, '[Hh][Rr][Ee][Ff]%s*=%s*"([^"]*)"', collect)
    body = string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*'([^']*)'", collect)
    -- an unquoted value ends at the first whitespace, quote or '>'
    string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*([^%s>\"']+)", collect)
    return links
end

-- Loads the document at 'address' and starts a status check for every
-- link found in it. Prints the load error and returns if loading fails.
local function checklinks(address)
    local base, body, err = loadpage(address)
    if not body then print(err) return end
    print("Checking ", base)
    for _, link in ipairs(getlinks(body, base)) do
        getstatus(link)
    end
end

-- plain file names on the command line are treated as "file:" URLs
for _, address in ipairs(args) do
    checklinks(url.absolute("file:", address))
end

-- keep the dispatcher running until every started check has reported
while nthreads > 0 do
    handler:step()
end