#!/usr/bin/env python3
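"""Extract the outbound links from the html responses in one or more
warc files, printing one absolute url per line for every 2xx html
response record.

A sketch of a typical invocation (the exact module path is an
assumption; it depends on how the warctools package is installed):

    python -m hanzo.warclinks crawl-00000.warc.gz > links.txt
"""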
import os.path
import sys
import logging
import traceback

from urllib.parse import urlparse, urlunparse
from html.parser import HTMLParser
from optparse import OptionParser
from contextlib import closing

# relative imports: this module is expected to live inside the warctools
# package, next to its warctools and httptools helpers
from .warctools import WarcRecord, expand_files
from .httptools import RequestMessage, ResponseMessage


LEVELS = {'debug': logging.DEBUG,
          'info': logging.INFO,
          'warning': logging.WARNING,
          'error': logging.ERROR,
          'critical': logging.CRITICAL}

parser = OptionParser(usage="%prog [options] warc (warc ...)")

parser.add_option("-L", "--log-level", dest="log_level")

parser.set_defaults(log_level="info")


def parse_http_response(record):
    """Parse the http response held in a warc response record.

    Returns (status_code, mime_type, message), where mime_type is the
    Content-Type value without its parameters, or None if the header
    is absent.
    """
    message = ResponseMessage(RequestMessage())
    remainder = message.feed(record.content[1])
    message.close()
    if remainder or not message.complete():
        if remainder:
            logging.warning('trailing data in http response for %s' % record.url)
        if not message.complete():
            logging.warning('truncated http response for %s' % record.url)

    header = message.header

    mime_type = [v for k, v in header.headers if k.lower() == 'content-type']
    if mime_type:
        mime_type = mime_type[0].split(';')[0]
    else:
        mime_type = None

    return header.code, mime_type, message


def extract_links_from_warcfh(fh):
    """Yield the links found in every 2xx html response record of a warc."""
    for (offset, record, errors) in fh.read_records(limit=None):
        if record:
            try:
                content_type, content = record.content

                # record fields may come back as bytes depending on the
                # warctools version; normalize before comparing to str
                if isinstance(content_type, bytes):
                    content_type = content_type.decode('ascii', 'replace')

                if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'):

                    code, mime_type, message = parse_http_response(record)

                    if 200 <= code < 300 and mime_type and 'html' in mime_type:
                        url = record.url
                        if isinstance(url, bytes):
                            url = url.decode('utf-8', 'ignore')
                        for link in extract_links_from_html(url, message.get_body()):
                            # drop stray newlines and tabs so the output
                            # stays one link per line
                            yield "".join(c for c in link if c not in '\n\r\t')

            except Exception as e:
                logging.warning("error in handling record: %s" % e)
                traceback.print_exc()

        elif errors:
            logging.warning("warc error at %d: %s" % (offset if offset else 0, ", ".join(str(e) for e in errors)))
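
# A minimal usage sketch for the generator above (it mirrors what main()
# does below; "example.warc.gz" is just a stand-in filename):
#
#     with closing(WarcRecord.open_archive(filename="example.warc.gz",
#                                          gzip="auto")) as fh:
#         for link in extract_links_from_warcfh(fh):
#             print(link)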


try:
    import lxml.html

    def extract_links_from_html(base, body):
        try:
            html = lxml.html.fromstring(body)
            html.make_links_absolute(base)

            for element, attribute, link, pos in html.iterlinks():
                if isinstance(link, bytes):
                    # normalize to text so the caller can filter characters
                    link = link.decode('utf-8', 'ignore')
                yield link

        except Exception:
            logging.warning("(lxml) html parse error")
            traceback.print_exc()


except ImportError:
    logging.warning("lxml not available, using fallback html.parser link extractor")

    def extract_links_from_html(base, body):
        try:
            if isinstance(body, bytes):
                # html.parser wants text, but the message body is raw bytes
                body = body.decode('utf-8', 'ignore')
            html = LinkParser(base)
            html.feed(body)
            html.close()
            for link in html.get_abs_links():
                yield link
        except Exception:
            # HTMLParseError was removed in Python 3.5 and html.parser no
            # longer raises it, so catch parse failures generically
            logging.warning("html parse error")
            traceback.print_exc()


# fallback link extractor

def attr_extractor(*names):
    def _extractor(attrs):
        return [value for key, value in attrs if key in names and value]
    return _extractor
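
# For example, attr_extractor("href") builds a function that pulls the
# href values out of html.parser's (name, value) attribute pairs:
#
#     >>> attr_extractor("href")([("href", "/x"), ("id", "y")])
#     ['/x']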


def meta_extractor(attrs):
    """Pull urls out of a <meta> tag's content attribute."""
    content = [value for key, value in attrs if key == "content" and value]
    urls = []
    for value in content:
        for pair in value.split(";"):
            # split on the first "=" only, so urls containing "=" keep
            # their tail, and strip the space left over from the ";" split
            bits = pair.split("=", 1)
            if len(bits) > 1 and bits[0].strip().lower() == "url":
                urls.append(bits[1].strip())
    return urls
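
# This targets the http-equiv refresh pattern; for
#     <meta http-equiv="refresh" content="0; url=http://example.com/next">
# html.parser hands over attribute pairs such that:
#
#     >>> meta_extractor([("http-equiv", "refresh"),
#     ...                 ("content", "0; url=http://example.com/next")])
#     ['http://example.com/next']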


class LinkParser(HTMLParser):
    """Fallback extractor: collects the link-bearing attributes of each tag."""

    def __init__(self, base):
        HTMLParser.__init__(self)
        self.links = []
        self.base = base

        self.tag_extractor = {
            "a": attr_extractor("href"),
            "applet": attr_extractor("code"),
            "area": attr_extractor("href"),
            "bgsound": attr_extractor("src"),
            "body": attr_extractor("background"),
            "embed": attr_extractor("href", "src"),
            "fig": attr_extractor("src"),
            "form": attr_extractor("action"),
            "frame": attr_extractor("src"),
            "iframe": attr_extractor("src"),
            "img": attr_extractor("href", "src", "lowsrc"),
            "input": attr_extractor("src"),
            "link": attr_extractor("href"),
            "layer": attr_extractor("src"),
            "object": attr_extractor("data"),
            "overlay": attr_extractor("src"),
            "script": attr_extractor("src"),
            "table": attr_extractor("background"),
            "td": attr_extractor("background"),
            "th": attr_extractor("background"),

            "meta": meta_extractor,
            "base": self.base_extractor,
        }

    def base_extractor(self, attrs):
        # a <base href="..."> overrides the document url for resolution
        base = [value for key, value in attrs if key == "href" and value]
        if base:
            self.base = base[-1]
        return ()

    def handle_starttag(self, tag, attrs):
        extractor = self.tag_extractor.get(tag, None)
        if extractor:
            self.links.extend(extractor(attrs))

    def get_abs_links(self):
        full_urls = []
        root = urlparse(self.base)
        root_dir = os.path.split(root.path)[0]
        for link in self.links:
            parsed = urlparse(link)
            if not parsed.netloc:  # no scheme or host, i.e. a relative link
                if parsed.path.startswith("/"):
                    # host-relative: graft the link onto the base's
                    # scheme and host, dropping any fragment
                    parsed = root[0:2] + parsed[2:5] + (None,)
                else:
                    # path-relative: consume leading "../" and "./"
                    # segments against the base directory by hand
                    dirname = root_dir
                    path = parsed.path
                    while True:
                        if path.startswith("../"):
                            path = path[3:]
                            dirname = os.path.split(dirname)[0]
                        elif path.startswith("./"):
                            path = path[2:]
                        else:
                            break

                    parsed = root[0:2] + (os.path.join(dirname, path),) + parsed[3:5] + (None,)
                new_link = urlunparse(parsed)
                logging.debug("relative %s -> %s" % (link, new_link))
                link = new_link

            else:
                logging.debug("absolute %s" % link)
            full_urls.append(link)
        return full_urls
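
# Note: urllib.parse.urljoin implements full RFC 3986 reference resolution
# and agrees with the hand-rolled logic above in the common cases; shown
# here only as a sketch, it is not wired in, so the fallback keeps its
# existing behaviour:
#
#     >>> from urllib.parse import urljoin
#     >>> urljoin("http://example.com/a/b.html", "../c.html")
#     'http://example.com/c.html'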


def main(argv):
    (options, warcs) = parser.parse_args(args=argv[1:])
    if options.log_level not in LEVELS:
        parser.error("unknown log level %r" % options.log_level)
    logging.basicConfig(level=LEVELS[options.log_level])

    if len(warcs) < 1:
        parser.error("missing warc(s)")

    ret = 0

    for warc in expand_files(warcs):
        try:
            with closing(WarcRecord.open_archive(filename=warc, gzip="auto")) as fh:
                for link in extract_links_from_warcfh(fh):
                    print(link)

        except Exception as e:
            logging.error(str(e))
            ret -= 1

    return ret


def run():
    sys.exit(main(sys.argv))


if __name__ == '__main__':
    run()