1#!/usr/bin/python 2from __future__ import print_function 3 4import os 5import re 6import sys 7import os.path 8import logging 9 10from urllib.parse import urlparse, urlunparse 11from html.parser import HTMLParser, HTMLParseError 12from optparse import OptionParser 13from contextlib import closing 14 15from .warctools import WarcRecord, expand_files 16from .httptools import RequestMessage, ResponseMessage 17 18 19LEVELS = {'debug': logging.DEBUG, 20 'info': logging.INFO, 21 'warning': logging.WARNING, 22 'error': logging.ERROR, 23 'critical': logging.CRITICAL} 24 25parser = OptionParser(usage="%prog [options] warc (warc ...)") 26 27parser.add_option("-L", "--log-level", dest="log_level") 28 29parser.set_defaults(log_level="info") 30 31 32 33def parse_http_response(record): 34 message = ResponseMessage(RequestMessage()) 35 remainder = message.feed(record.content[1]) 36 message.close() 37 if remainder or not message.complete(): 38 if remainder: 39 logging.warning('trailing data in http response for %s'% record.url) 40 if not message.complete(): 41 logging.warning('truncated http response for %s'%record.url) 42 43 header = message.header 44 45 mime_type = [v for k,v in header.headers if k.lower() =='content-type'] 46 if mime_type: 47 mime_type = mime_type[0].split(';')[0] 48 else: 49 mime_type = None 50 51 return header.code, mime_type, message 52 53 54def extract_links_from_warcfh(fh): 55 for (offset, record, errors) in fh.read_records(limit=None): 56 if record: 57 try: 58 content_type, content = record.content 59 60 if record.type == WarcRecord.RESPONSE and content_type.startswith('application/http'): 61 62 code, mime_type, message = parse_http_response(record) 63 64 if 200 <= code < 300 and mime_type.find('html') > -1: 65 for link in extract_links_from_html(record.url, message.get_body()): 66 yield ("".join(c for c in link if c not in '\n\r\t')) 67 68 69 except Exception as e: 70 logging.warning("error in handling record "+str(e)) 71 import traceback; traceback.print_exc() 72 73 elif errors: 74 logging.warning("warc error at %d: %s"%((offset if offset else 0), ", ".join(str(e) for e in errors))) 75 import traceback; traceback.print_exc() 76 77 78 79try: 80 import lxml.html 81 82 def extract_links_from_html(base, body): 83 try: 84 html = lxml.html.fromstring(body) 85 html.make_links_absolute(base) 86 87 for element, attribute, link, pos in html.iterlinks(): 88 if isinstance(link, str): 89 link = link.encode('utf-8', 'ignore') 90 yield link 91 92 except Exception: 93 logging.warning("(lxml) html parse error") 94 import traceback; traceback.print_exc() 95 96 97except ImportError: 98 logging.warning("using fallback parser") 99 def extract_links_from_html(base, body): 100 try: 101 html = LinkParser(base) 102 html.feed(body) 103 html.close() 104 for link in html.get_abs_links(): 105 yield link 106 except HTMLParseError as ex: 107 logging.warning("html parse error") 108 109 110""" fallback link extractor """ 111def attr_extractor(*names): 112 def _extractor(attrs): 113 return [value for key,value in attrs if key in names and value] 114 return _extractor 115 116def meta_extractor(attrs): 117 content = [value for key,value in attrs if key =="content" and value] 118 urls = [] 119 for value in content: 120 for pair in value.split(";"): 121 bits = pair.split("=",2) 122 if len(bits)>1 and bits[0].lower()=="url": 123 urls.append(bits[1].strip()) 124 return urls 125 126 127class LinkParser(HTMLParser): 128 def __init__(self, base): 129 HTMLParser.__init__(self) 130 self.links = [] 131 self.base = base 132 133 self.tag_extractor = { 134 "a": attr_extractor("href"), 135 "applet": attr_extractor("code"), 136 "area": attr_extractor("href"), 137 "bgsound": attr_extractor("src"), 138 "body": attr_extractor("background"), 139 "embed": attr_extractor("href","src"), 140 "fig": attr_extractor("src"), 141 "form": attr_extractor("action"), 142 "frame": attr_extractor("src"), 143 "iframe": attr_extractor("src"), 144 "img": attr_extractor("href","src","lowsrc"), 145 "input": attr_extractor("src"), 146 "link": attr_extractor("href"), 147 "layer": attr_extractor("src"), 148 "object": attr_extractor("data"), 149 "overlay": attr_extractor("src"), 150 "script": attr_extractor("src"), 151 "table": attr_extractor("background"), 152 "td": attr_extractor("background"), 153 "th": attr_extractor("background"), 154 155 "meta": meta_extractor, 156 "base": self.base_extractor, 157 } 158 159 def base_extractor(self, attrs): 160 base = [value for key,value in attrs if key == "href" and value] 161 if base: 162 self.base = base[-1] 163 return () 164 165 def handle_starttag(self, tag, attrs): 166 extractor = self.tag_extractor.get(tag, None) 167 if extractor: 168 self.links.extend(extractor(attrs)) 169 170 def get_abs_links(self): 171 full_urls = [] 172 root = urlparse(self.base) 173 root_dir = os.path.split(root.path)[0] 174 for link in self.links: 175 parsed = urlparse(link) 176 if not parsed.netloc: # does it have no protocol or host, i.e relative 177 if parsed.path.startswith("/"): 178 parsed = root[0:2] + parsed[2:5] + (None,) 179 else: 180 dir = root_dir 181 path = parsed.path 182 while True: 183 if path.startswith("../"): 184 path=path[3:] 185 dir=os.path.split(dir)[0] 186 elif path.startswith("./"): 187 path=path[2:] 188 else: 189 break 190 191 parsed = root[0:2] + (os.path.join(dir, path),) + parsed[3:5] + (None,) 192 new_link = urlunparse(parsed) 193 logging.debug("relative %s -> %s"%(link, new_link)) 194 link=new_link 195 196 else: 197 logging.debug("absolute %s"%link) 198 full_urls.append(link) 199 return full_urls 200 201 202def main(argv): 203 (options, warcs) = parser.parse_args(args=argv[1:]) 204 logging.basicConfig(level=LEVELS[options.log_level]) 205 206 if len(warcs) < 1: 207 parser.error("missing warcs(s)") 208 209 210 ret = 0 211 212 for warc in expand_files(warcs): 213 try: 214 with closing(WarcRecord.open_archive(filename=warc, gzip="auto")) as fh: 215 for link in extract_links_from_warcfh(fh): 216 print(link) 217 218 except Exception as e: 219 logging.error(str(e)) 220 ret -=1 221 222 return ret 223 224 225def run(): 226 sys.exit(main(sys.argv)) 227 228 229if __name__ == '__main__': 230 run() 231 232 233