1#!/bin/sh
2# Copyright 2009 Google Inc. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#      http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15#
16#
17#
18# This script is provided to update the data/top-10000.txt file.
19
# Stop at the first failing command or reference to an unset variable, so a
# failed download or unzip cannot silently yield an empty or bogus output file.
set -eu

# Name of the CSV file inside the downloaded archive. The second
# comma-separated field of each row is what ends up in the output.
CSV='top-1m.csv'

# TODO(tstromberg): Replace this bad hack. In real-world observations, hosts
# only use 1-2 Google TLDs. Clean all but the real one to match reality.
REMOVE='google\.[a-z][a-z]$|google\.co\.|google\.com\.[a-z][a-z]'
ALEXA_URL="http://s3.amazonaws.com/alexa-static/${CSV}.zip"
# Maximum number of entries written to the output file.
TOP_COUNT=10000
# Number of leading CSV rows fed into the filters. It is larger than
# TOP_COUNT, presumably so that enough entries remain once ordered-uniq.py
# and the REMOVE filter have dropped some of them.
UNIQ_COUNT=14000
OUTPUT="alexa-top-${TOP_COUNT}-global.txt"

# Reuse a local copy of the CSV (or of the zip) when one is already present;
# only hit the network when neither exists.
if [ ! -f "$CSV" ]; then
  if [ ! -f "${CSV}.zip" ]; then
    echo "${CSV}.zip not found - Fetching $ALEXA_URL"
    # -f makes curl fail on HTTP errors instead of saving the error page
    # under the archive's name.
    if ! curl -f -O "$ALEXA_URL"; then
      # Do not leave a partial archive behind: the next run would skip the
      # download and try to unzip it.
      rm -f -- "${CSV}.zip"
      echo "Failed to download $ALEXA_URL" >&2
      exit 1
    fi
  fi
  unzip -o "${CSV}.zip" "$CSV"
fi

# -f: the output file does not exist on the first run; that is not an error.
rm -f -- "$OUTPUT"

# Take the second CSV field, strip everything from the first '/' onwards,
# keep the first UNIQ_COUNT rows, pass them through ordered-uniq.py
# (NOTE(review): presumably an order-preserving de-duplicator - confirm),
# drop entries matching REMOVE and keep the first TOP_COUNT survivors.
# REMOVE must stay quoted: it contains '[a-z]' and '|', which the shell would
# otherwise subject to pathname expansion.
cut -d, -f2 "$CSV" | cut -d/ -f1 | head -n "$UNIQ_COUNT" | ./ordered-uniq.py \
  | grep -E -v -- "$REMOVE" | head -n "$TOP_COUNT" > "$OUTPUT"
ls -la -- "$OUTPUT"
42
43
44
45