1#!/usr/bin/env python
2# Copyright 2009 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Subdomain data parser for Alexa."""
17
18__author__ = 'tstromberg@google.com (Thomas Stromberg)'
19
20import glob
21import operator
22import os
23import os.path
24import re
25import sys
26import time
27
28if __name__ == '__main__':
29  sys.path.append('..')
30
31# See if a third_party library exists -- use it if so.
32try:
33  import third_party
34except ImportError:
35  pass
36
37
38import httplib2
39
# Where httplib2 stores cached HTTP responses (one file per URL).
CACHE_DIR = os.getenv('HOME') + '/.alexa_cache'
# Accept cached responses up to 90 days old (seconds).
CACHE_EXPIRATION = 86400 * 90
# Seconds to pause between requests, to be polite to alexa.com.
SLEEPY_TIME = 15
# Maximum number of fetch attempts per URL before giving up.
MAX_ATTEMPTS = 5

# Domains known to serve content on the bare (naked) domain, so we do
# not rewrite them to a 'www.' host in the main loop below.
NAKED_DOMAINS = ['twitter.com', 'rapidshare.com', 'perezhilton.com', 'posterous.com']
46
def FetchUrl(url, attempts=0):
  """Fetch a URL via httplib2 with retries and a polite delay.

  Args:
    url: URL string to fetch.
    attempts: number of attempts made so far (used by the retry recursion).

  Returns:
    The response body as a string, or None if all MAX_ATTEMPTS failed.
  """
  attempts += 1
  print >> sys.stderr, "Fetching %s (attempt %s)" % (url, attempts)
  # httplib2 uses CACHE_DIR as an on-disk response cache.
  h = httplib2.Http(CACHE_DIR, timeout=10)
  try:
    resp, content = h.request(url, 'GET', headers={'cache_control': 'max-age=%s' % CACHE_EXPIRATION})
  except Exception:
    # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
    # propagate; any network/parse error triggers a retry.
    if attempts < MAX_ATTEMPTS:
      print >> sys.stderr, "Will try again..."
      time.sleep(SLEEPY_TIME)
      return FetchUrl(url, attempts=attempts)
    # All attempts failed: return None explicitly instead of falling
    # through to an unbound 'content' (UnboundLocalError in original).
    return None

  # Pause after a successful fetch to avoid hammering the server.
  time.sleep(SLEEPY_TIME)
  return content
61
def FetchCachedAlexaPage(domain):
  """Return the Alexa siteinfo page for a domain, preferring the disk cache.

  Args:
    domain: domain name string, e.g. 'example.com'.

  Returns:
    The page HTML as a string (from cache if present, otherwise fetched).
  """
  url_path = 'www.alexa.com/siteinfo/%s' % domain
  # httplib2 names cache files after the URL with '/' replaced by ','.
  cache_path = '%s/%s' % (CACHE_DIR, url_path.replace('/', ','))
  # NOTE(review): this returns the first cache hit regardless of age;
  # expiration is only enforced on the HTTP path — confirm intended.
  for cached_file in glob.glob("%s,*" % cache_path):
    # Renamed from 'file' to avoid shadowing the builtin; close the
    # handle explicitly instead of leaking it.
    f = open(cached_file)
    try:
      return f.read()
    finally:
      f.close()

  # No cache hit: fetch over the network.
  return FetchUrl("http://%s" % url_path)
71
def ParseAlexaSubdomains(content):
  """Extract (host, percentage) pairs from an Alexa siteinfo HTML page.

  Args:
    content: HTML of an Alexa /siteinfo page as a string.

  Returns:
    A list of (hostname, percentage_string) tuples, e.g.
    [('www.example.com', '12.3')].
  """
  # Raw string so regex escapes like \w and \d are explicit rather than
  # relying on invalid-escape passthrough; re.M | re.S lets .*? span the
  # markup between the hostname cell and its percentage cell.
  return re.findall(r'\<p class=\"tc1.*?>([\w\.-]+\.[\w]{2,})\<\/p>.*?tc1.*?(\d+\.\d+)%', content, re.M | re.S)
74
def GetHostsForDomain(domain):
  """Return the (host, percentage) tuples Alexa reports for a domain."""
  # Fetch (cache-aware) and parse in one composed expression.
  return ParseAlexaSubdomains(FetchCachedAlexaPage(domain))
78
79if __name__ == '__main__':
80  index = 0
81  results = {}
82
83  for domain in sys.stdin:
84    index += 1
85    domain = domain.rstrip()
86    for host, percentage in GetHostsForDomain(domain):
87      if host == domain and domain not in NAKED_DOMAINS:
88        host = '.'.join(('www', domain))
89      if percentage == '0.0':
90        continue
91      score = index / (float(percentage) / 100)
92      if host not in results:
93        results[host] = score
94      print >> sys.stderr, "%s: %s (%s)" % (score, host, percentage)
95
96  for host, score in sorted(results.items(), key=operator.itemgetter(1)):
97    print "A %s." % host
98
99
100