#!/bin/sh
# Copyright 2009 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
#
# This script is provided to update the data/top-10000.txt file.
#
# What it does, run from the current directory:
#   1. Ensures $CSV exists, downloading and unpacking $CSV.zip if needed.
#   2. Takes the second comma-separated field of the first $UNIQ_COUNT rows,
#      drops anything after the first '/', pipes the result through
#      ./ordered-uniq.py, removes entries matching $REMOVE, and writes the
#      first $TOP_COUNT remaining lines to $OUTPUT.
#
# Requires: curl, unzip, and an executable ./ordered-uniq.py.
#
# NOTE(review): the result is written to $OUTPUT, not data/top-10000.txt;
# presumably it is copied there by hand afterwards - confirm.

set -eu

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

CSV='top-1m.csv'

# TODO(tstromberg): Replace this bad hack. In real-world observations, hosts
# only use 1-2 Google TLD's. Clean all but the real one to match reality.
# Extended regex: google.<xx> at end of line, google.co.*, google.com.<xx>.
REMOVE='google\.[a-z][a-z]$|google\.co\.|google\.com\.[a-z][a-z]'
ALEXA_URL=http://s3.amazonaws.com/alexa-static/$CSV.zip
TOP_COUNT=10000
# Rows read from the CSV before filtering; larger than TOP_COUNT so that
# enough entries remain after the later pipeline stages drop some.
UNIQ_COUNT=14000
OUTPUT=alexa-top-${TOP_COUNT}-global.txt

# Fail early: without pipefail a missing helper inside the pipeline would go
# unnoticed and silently yield an empty output file.
[ -x ./ordered-uniq.py ] || die "./ordered-uniq.py not found or not executable"

if [ ! -f "$CSV" ]; then
  if [ ! -f "${CSV}.zip" ]; then
    echo "${CSV}.zip not found - Fetching $ALEXA_URL"
    # -f: treat HTTP errors as failures instead of saving the error page
    # under the archive's name.
    if ! curl -f -O "$ALEXA_URL"; then
      # Remove any partial download so the next run retries the fetch.
      rm -f "${CSV}.zip"
      die "Failed to download $ALEXA_URL"
    fi
  fi
  unzip -o "${CSV}.zip" "$CSV" || die "Failed to extract $CSV from ${CSV}.zip"
fi

# -f: the output file does not exist on a first run.
rm -f "$OUTPUT"
cut -d, -f2 "$CSV" \
  | cut -d/ -f1 \
  | head -n "$UNIQ_COUNT" \
  | ./ordered-uniq.py \
  | grep -E -v -e "$REMOVE" \
  | head -n "$TOP_COUNT" > "$OUTPUT"
ls -la "$OUTPUT"