#!/usr/bin/php
<?php
/***********************************************************
 mktop1k.php
 Copyright (C) 2007 Hewlett-Packard Development Company, L.P.

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 version 2 as published by the Free Software Foundation.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License along
 with this program; if not, write to the Free Software Foundation, Inc.,
 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 ***********************************************************/

/**
 * mktop1k: extract the top 1000 Freshmeat projects from the rdf into a file.
 *
 * mktop1k makes no attempt to create unique output file names.  You have been
 * warned: an existing output file is truncated and overwritten.
 *
 * Exit status is 0 on success (or when -h is given) and 1 on any usage or
 * I/O error.  Error messages are written to STDERR.
 *
 * @param string $in-file  path to uncompressed FM rdf xml file
 * @param string $out-file output file name. Will use cwd if no path supplied.
 *
 *
 * @package mktop1k
 * @author mark.donohoe@hp.com
 * @version 0.3
 *
 */

// FIXME: this should be a global from pathinclude? $LIBDIR = '/usr/local/lib';
// NOTE(review): $LIBDIR is not assigned in this file; it is presumably set by
// pathinclude.php -- confirm.
require_once("FIXMETOBERELATIVE/pathinclude.php");
require_once("$LIBDIR/lib_projxml.h.php");
//require_once("./lib_projxml.h.php");         // dev copy


$usage = <<< USAGE
Usage: mktop1k [-h] -i <in-file> -o <out-file> [-n nnn]
   Where: -h             optional help, displays this message
          <in-file>      path to an uncompressed Freshmeat rdf XML file
          <out-file>     path to filename where the xml output will be generated.
          -n nnn         optional parameter to indicate how many projects to
                         extract.

                         Default is 1000.

                         The projects are always extracted in priority order.
                         For example, -n 10 will get the top 10 Freshmeat packages.
                         A range of numbers is not supported.

USAGE;

/**
 * Report a fatal error on STDERR and terminate with a non-zero exit status.
 *
 * die("message") exits with status 0, which hides failures from calling
 * shell scripts; this helper is used instead.
 *
 * @param string $message text to print; a trailing newline is appended.
 */
function mktop1k_fail($message)
{
  fwrite(STDERR, $message . "\n");
  exit(1);
}

/**
 * Return the text of the most recent PHP warning/error, if any.
 *
 * Replaces $php_errormsg, which was deprecated in PHP 7.2 and removed in 8.0.
 *
 * @return string the last error message, or 'unknown error' when none exists.
 */
function mktop1k_last_error()
{
  $last = error_get_last();
  return ($last !== null) ? $last['message'] : 'unknown error';
}

// default number of projects to get.
$HowMany_projects = 1000;
$in_file  = null;
$out_file = null;

// Parse the options first so that "-h" on its own works; the required
// options are checked after the loop.
for ($i = 1; $i < $argc; $i++) {
  switch ($argv[$i]) {
    case '-i':
      $i++;
      if (!isset($argv[$i])) {
        mktop1k_fail("ERROR: Must specify an uncompressed filename after -i");
      }
      $in_file = $argv[$i];
      break;
    case '-h':
      echo $usage;
      exit(0);
    case '-n':
      $i++;
      // Reject missing, non-numeric and zero values: (int) on a non-numeric
      // string would silently yield 0 and extract nothing.
      if (!isset($argv[$i])
          || !preg_match('/^[0-9]+$/', $argv[$i])
          || (int) $argv[$i] < 1) {
        mktop1k_fail("ERROR: Must specify a positive number of projects after -n");
      }
      $HowMany_projects = (int) $argv[$i];
      break;
    case '-o':
      $i++;
      if (!isset($argv[$i])) {
        mktop1k_fail("ERROR: Must specify an output filename after -o");
      }
      $out_file = $argv[$i];
      break;
    default:
      mktop1k_fail("ERROR: Unknown argument: $argv[$i]\n$usage");
  }
}

// Both -i and -o are mandatory.
if ($in_file === null || $out_file === null) {
  echo $usage;
  exit(1);
}

$F1 = fopen($in_file, 'r');
if ($F1 === false) {
  mktop1k_fail("can't open file: " . mktop1k_last_error());
}

/* look for the top 1000 projects, when found, write the project
   entry to a file.

   NOTE: I'm bothered by something here... while one gets the top
   1000, there could be drastic differences (not likely between any two
   days, but possible)....It doesn't really affect this code, but could
   affect users of the output files.
 */

$Output = fopen($out_file, 'w');
if ($Output === false) {
  mktop1k_fail("Can't open: " . mktop1k_last_error());
}

echo "Extracting the top $HowMany_projects projects from:\n$in_file\n";
echo "\nWriting the top $HowMany_projects projects to: $out_file\n";

// need a valid doc, write the header 1st, and open tag
write_hdr($Output);

// File offset just past the most recent <project> line; null until one is seen.
$proj_mark = null;

// The 1024-byte read length is kept as-is: write_entry() receives offsets
// obtained from ftell() and is defined in lib_projxml, so the chunking is
// left untouched.  Strict comparison keeps a final line of "0" from being
// mistaken for end-of-file.
while (($line = fgets($F1, 1024)) !== false) {

  if (strpos($line, '<project>') !== false) {
    $proj_mark = ftell($F1);
  }
  elseif (preg_match('/<popularity_rank>([0-9]+).*</', $line, $matches)) {
    // The capture holds the leading digits of the rank, independent of any
    // other markup that may precede the tag on the same line.
    $rank = (int) $matches[1];
    if ($rank <= $HowMany_projects) {
      if ($proj_mark === null) {
        // A rank outside any <project> element: there is no entry to copy.
        fwrite(STDERR, "WARNING: popularity_rank $rank found before any <project>, skipped\n");
        continue;
      }
      // write_entry() comes from lib_projxml; presumably it copies the
      // project entry that starts at $proj_mark to $Output -- verify there.
      write_entry($F1, $proj_mark, $Output);
    }
  }

}

// write the end tag and close up shop

close_tag($Output);
fclose($F1);
fclose($Output);

echo "Done\n";