1-----------------------------------------------------------------------------
2-- Little program that checks links in HTML files, using coroutines and
3-- non-blocking I/O via the dispatcher module.
4-- LuaSocket sample files
5-- Author: Diego Nehab
6-----------------------------------------------------------------------------
7local url = require("socket.url")
8local dispatch = require("dispatch")
9local http = require("socket.http")
-- set the dispatch module's TIMEOUT field (unit presumably seconds --
-- confirm against dispatch.lua)
dispatch.TIMEOUT = 10

-- make sure the user knows how to invoke us
arg = arg or {}
-- arg[1] is nil exactly when no argument was supplied; testing it directly
-- avoids table.getn, which is deprecated in Lua 5.1 and gone in later versions
if arg[1] == nil then
    print("Usage:\n  luasocket check-links.lua [-n] {<url>}")
    -- there is no global exit() in Lua 5.x; os.exit is the supported call,
    -- and a non-zero status reports the usage error to the caller
    os.exit(1)
end
18
-- Select the dispatcher flavour used for all requests.
-- A leading '-n' argument asks for non-blocking I/O: it is removed from arg
-- and the real ("coroutine") dispatcher interface is used. Without it the
-- script runs with blocking I/O through the fake ("sequential") interface.
do
    local kind = "sequential"
    if arg[1] == "-n" then
        table.remove(arg, 1)
        kind = "coroutine"
    end
    handler = dispatch.newhandler(kind)
end
28
-- number of HEAD requests started through the handler and not yet reported
local nthreads = 0

-- Checks one link using the dispatcher and writes the outcome to stdout.
-- link: URL to check; it is parsed with "file" as the default scheme and
--       only links whose scheme is "http" are requested, others are ignored.
-- Output is a tab-indented line: the link alone when the HEAD request
-- succeeds with code 200, otherwise the link followed by ': ' and the code
-- (or error message) that came back.
function getstatus(link)
    local scheme = url.parse(link, {scheme = "file"}).scheme
    if scheme ~= "http" then return end

    -- counted before the request is handed over, released when it reports
    nthreads = nthreads + 1

    local function probe()
        local ok, code = http.request{
            method = "HEAD",
            url = link,
            create = handler.tcp
        }
        if ok and code == 200 then
            io.write('\t', link, '\n')
        else
            io.write('\t', link, ': ', tostring(code), '\n')
        end
        nthreads = nthreads - 1
    end

    handler:start(probe)
end
48
-- Reads a whole file whose name is given as a URL-escaped path.
-- path: escaped path; it is passed through url.unescape before opening.
-- Returns the file content as one string, or nil followed by the error
-- message from io.open when the file cannot be opened.
function readfile(path)
    local fh, err = io.open(url.unescape(path), "r")
    if not fh then return nil, err end
    local content = fh:read("*a")
    fh:close()
    return content
end
58
-- Retrieves the document named by u.
-- u: address to fetch; parsed with "file" as the default scheme.
-- Returns three values: base, body, error.
--   base  is u, or the location header of the reply when an http request
--         came back with code 200 and such a header is present
--   body  is the document content, or nil on failure
--   error describes the failure: the second result of http.request when it
--         produced no body, the message from readfile, or a note about an
--         unhandled scheme; nil on success
function load(u)
    local parsed = url.parse(u, { scheme = "file" })
    local scheme = parsed.scheme

    if scheme == "http" then
        local base = u
        local body, code, headers = http.request(u)
        if code == 200 then
            -- if there was a redirect, update base to reflect it
            base = headers.location or base
        end
        if not body then return base, body, code end
        return base, body, nil
    end

    if scheme == "file" then
        local body, err = readfile(parsed.path)
        return u, body, err
    end

    return u, nil, string.format("unhandled scheme '%s'", scheme)
end
77
-- Extracts the link targets (href attribute values) from an HTML document.
-- body: HTML text to scan; comments are stripped before scanning.
-- base: URL handed to url.absolute to resolve each target.
-- Returns an array of resolved URLs grouped by quoting style: double-quoted
-- values first, then single-quoted ones, then unquoted ones.
function getlinks(body, base)
    local links = {}

    -- Records one target. Returning "" makes gsub erase the attribute that
    -- was just consumed: since Lua 5.1 a callback that returns nothing keeps
    -- the match in place, which let the unquoted pass below pick up quoted
    -- values a second time, quotes included.
    local function collect(href)
        table.insert(links, url.absolute(base, href))
        return ""
    end

    -- get rid of comments
    body = string.gsub(body, "%<%!%-%-.-%-%-%>", "")
    -- extract links, most specific quoting style first
    body = string.gsub(body, '[Hh][Rr][Ee][Ff]%s*=%s*"([^"]*)"', collect)
    body = string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*'([^']*)'", collect)
    -- an unquoted value ends at the first whitespace or '>', so attributes
    -- that follow it in the same tag are not swallowed into the link
    string.gsub(body, "[Hh][Rr][Ee][Ff]%s*=%s*([^%s>]+)", collect)
    return links
end
94
-- Loads one document and submits every link found in it for checking.
-- address: location of the document, as accepted by load.
-- When the document cannot be loaded the error is printed and nothing else
-- happens; otherwise "Checking " and the base reported by load are printed
-- and each link returned by getlinks is passed to getstatus.
function checklinks(address)
    local base, body, err = load(address)
    if not body then
        print(err)
        return
    end
    print("Checking ", base)
    for _, link in ipairs(getlinks(body, base)) do
        getstatus(link)
    end
end
104
-- Every remaining command-line argument names a document to check; each one
-- is resolved against the "file:" base URL before being handed to checklinks.
for _, target in ipairs(arg) do
    local address = url.absolute("file:", target)
    checklinks(address)
end

-- Keep stepping the handler until every request counted in nthreads has
-- reported its result.
while nthreads > 0 do handler:step() end
112