#!/usr/bin/env python
# Copyright 2009 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Subdomain data parser for Alexa."""

__author__ = 'tstromberg@google.com (Thomas Stromberg)'

import glob
import operator
import os
import os.path
import re
import sys
import time

if __name__ == '__main__':
  sys.path.append('..')

# See if a third_party library exists -- use it if so.
try:
  import third_party
except ImportError:
  pass

# Local on-disk HTTP cache directory (shared with httplib2's cache layer).
# expanduser is robust even when $HOME is unset (os.getenv('HOME') would be
# None and crash the string concatenation).
CACHE_DIR = os.path.join(os.path.expanduser('~'), '.alexa_cache')
CACHE_EXPIRATION = 86400 * 90   # 90 days, in seconds
SLEEPY_TIME = 15                # polite delay between live fetches (seconds)
MAX_ATTEMPTS = 5

# Domains whose bare (naked) form is the canonical host -- do not prepend
# a 'www.' for these.
NAKED_DOMAINS = ['twitter.com', 'rapidshare.com', 'perezhilton.com', 'posterous.com']


def FetchUrl(url, attempts=0):
  """Fetch a URL via httplib2 with retry and on-disk caching.

  Args:
    url: string URL to fetch.
    attempts: number of attempts made so far (used by the recursive retry).

  Returns:
    The response body as a string, or None if MAX_ATTEMPTS fetches all failed.
    (The original code fell through with `content` unbound in that case and
    raised UnboundLocalError.)
  """
  # Imported lazily so that the pure parsing helpers in this module can be
  # used without httplib2 installed.
  import httplib2

  attempts += 1
  print("Fetching %s (attempt %s)" % (url, attempts), file=sys.stderr)
  http = httplib2.Http(CACHE_DIR, timeout=10)
  try:
    _, content = http.request(
        url, 'GET',
        headers={'cache_control': 'max-age=%s' % CACHE_EXPIRATION})
  except Exception:
    # Broad on purpose: httplib2 raises a mix of socket and library errors.
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit escape.
    if attempts < MAX_ATTEMPTS:
      print("Will try again...", file=sys.stderr)
      time.sleep(SLEEPY_TIME)
      return FetchUrl(url, attempts=attempts)
    return None

  # Rate-limit successive live fetches to be polite to the server.
  time.sleep(SLEEPY_TIME)
  return content


def FetchCachedAlexaPage(domain):
  """Return the Alexa siteinfo page for a domain, preferring the disk cache.

  httplib2 stores cache entries under CACHE_DIR with '/' replaced by ',';
  we glob for any matching entry before going to the network.
  """
  url_path = 'www.alexa.com/siteinfo/%s' % domain
  cache_path = '%s/%s' % (CACHE_DIR, url_path.replace('/', ','))
  for cached_file in glob.glob("%s,*" % cache_path):
    # `with` closes the handle (the original leaked it via builtin-shadowing
    # `file`); first match wins.
    with open(cached_file) as f:
      return f.read()

  # No cache hit -- fetch live.
  return FetchUrl("http://%s" % url_path)


def ParseAlexaSubdomains(content):
  """Extract (hostname, percentage) pairs from an Alexa siteinfo page.

  Returns:
    List of (host, percentage-string) tuples, e.g. [('www.foo.com', '12.3')].
  """
  # Raw string: same pattern as before, without the invalid '\<' / '\/'
  # string escapes (they only ever reached the regex engine as literals).
  return re.findall(
      r'<p class="tc1.*?>([\w.-]+\.\w{2,})</p>.*?tc1.*?(\d+\.\d+)%',
      content, re.M | re.S)


def GetHostsForDomain(domain):
  """Return parsed (host, percentage) pairs for one domain."""
  content = FetchCachedAlexaPage(domain)
  return ParseAlexaSubdomains(content)


if __name__ == '__main__':
  index = 0
  results = {}

  for domain in sys.stdin:
    index += 1
    domain = domain.rstrip()
    for host, percentage in GetHostsForDomain(domain):
      # Alexa reports the bare domain itself; map it to its www. host
      # unless the naked form is canonical for this domain.
      if host == domain and domain not in NAKED_DOMAINS:
        host = '.'.join(('www', domain))
      if percentage == '0.0':
        continue
      # Lower score = better: earlier stdin rank and higher traffic share.
      score = index / (float(percentage) / 100)
      if host not in results:
        results[host] = score
        print("%s: %s (%s)" % (score, host, percentage), file=sys.stderr)

  for host, score in sorted(results.items(), key=operator.itemgetter(1)):
    print("A %s." % host)