1#!/usr/bin/php
2
3<?php
4/***********************************************************
5 mktop1k.php
6 Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
7
8 This program is free software; you can redistribute it and/or
9 modify it under the terms of the GNU General Public License
10 version 2 as published by the Free Software Foundation.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License along
18 with this program; if not, write to the Free Software Foundation, Inc.,
19 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
20 ***********************************************************/
21
22/**
23 * mktop1k: extract the top 1000 Freshmeat projects from the rdf into a file.
24 *
25 * mktop1k makes no attempt to create unique output file names. You have been
26 * warned.
27 *
28 * @param string $in-file path to uncompressed FM rdf xml file
29 * @param string $out-file output file name. Will use cwd if no path supplied.
30 *
31 *
32 * @package mktop1k
33 * @author mark.donohoe@hp.com
34 * @version 0.3
35 *
36 */
37
// FIXME: this should be a global from pathinclude? $LIBDIR = '/usr/local/lib';
39require_once("FIXMETOBERELATIVE/pathinclude.php");
40require_once("$LIBDIR/lib_projxml.h.php");
41//require_once("./lib_projxml.h.php");            // dev copy
42
43
// Help text shown for -h and whenever the command line cannot be used.
// Built from one entry per output line; the empty last entry supplies the
// newline that terminates the final line of text.
$usage = implode("\n", array(
  'Usage: mktop1k [-h] -i <in-file> -o <out-file> [-n nnn]',
  '   Where: -h optional help, displays this message',
  '          <in-file> path to an uncompressed Freshmeat rdf XML file',
  '          <out-file> path to filename where the xml output will be generated.',
  '          -n nnn optional parameter to indicate how many projects to',
  '             extract.',
  '',
  '             Default is 1000.',
  '',
  '             The projects are always extracted in priority order.',
  '             For example, -n 10 will get the top 10 Freshmeat packages.',
  '             A range of numbers is not supported.',
  '',
));
59
/*
 * Command line parsing.
 *
 * Recognised options:
 *   -h         print the usage text and exit with status 0
 *   -i <file>  input file name (mandatory), stored in $in_file
 *   -o <file>  output file name (mandatory), stored in $out_file
 *   -n <nnn>   how many projects to extract, stored in $HowMany_projects
 *
 * Every problem is reported on stderr and ends the script with exit status 1
 * so that a calling shell script can detect the failure. (die() with a string
 * argument prints to stdout and exits with status 0.)
 */

// default number of projects to get.
$HowMany_projects = 1000;

// Mandatory options; they stay null until seen on the command line.
$in_file  = null;
$out_file = null;

for ($i = 1; $i < $argc; $i++) {
  switch ($argv[$i]) {
    case '-h':
      // Help is honoured no matter how many other arguments were given.
      echo $usage;
      exit(0);
    case '-i':
      $i++;
      if (!isset($argv[$i])) {
        fwrite(STDERR, "ERROR: Must specify an uncompressed filename after -i\n");
        exit(1);
      }
      $in_file = $argv[$i];
      break;
    case '-n':
      $i++;
      if (!isset($argv[$i])) {
        fwrite(STDERR, "ERROR: Must specify a number between 1-1000 after -n\n");
        exit(1);
      }
      // A plain (int) cast would turn garbage such as "abc" into 0 and
      // silently extract next to nothing, so insist on digits only.
      if (!preg_match('/^[0-9]+$/', $argv[$i]) || (int) $argv[$i] < 1) {
        fwrite(STDERR, "ERROR: -n needs a whole number of at least 1, got: $argv[$i]\n");
        exit(1);
      }
      $HowMany_projects = (int) $argv[$i];
      break;
    case '-o':
      $i++;
      if (!isset($argv[$i])) {
        fwrite(STDERR, "ERROR: Must specify an output filename after -o\n");
        exit(1);
      }
      $out_file = $argv[$i];
      break;
    default:
      fwrite(STDERR, "ERROR: Unknown argument: $argv[$i]\n$usage");
      exit(1);
  }
}

// -i and -o are both required; without them there is nothing to read or
// write, so show the usage text and give up.
if ($in_file === null || $out_file === null) {
  echo $usage;
  exit(1);
}
106
/*
 * Main pass: read the rdf file line by line and hand every project whose
 * popularity rank is within the requested top $HowMany_projects to
 * write_entry(), which writes it to the output file.
 *
 * Uses $in_file, $out_file and $HowMany_projects as set up by the command
 * line parsing. write_hdr(), write_entry() and close_tag() are not defined in
 * this file; presumably they come from lib_projxml.h.php.
 *
 * Failures to open either file are reported on stderr with exit status 1.
 * error_get_last() is used for the reason because $php_errormsg is only
 * filled in when track_errors is enabled and no longer exists in PHP 8.
 */
$F1 = fopen($in_file, 'r');
if ($F1 === false) {
  $open_err = error_get_last();
  fwrite(STDERR, "can't open file: " . ($open_err ? $open_err['message'] : $in_file) . "\n");
  exit(1);
}

/* look for the top N projects, when found, write the project
 entry to a file.

 NOTE: the set of top-ranked projects can differ between two rdf snapshots
 (not likely between any two days, but possible). That does not affect this
 code, but it may matter to users of the output files.
 */

$Output = fopen($out_file, 'w');
if ($Output === false) {
  $open_err = error_get_last();
  fclose($F1);
  fwrite(STDERR, "Can't open: " . ($open_err ? $open_err['message'] : $out_file) . "\n");
  exit(1);
}

echo "Extracting the top $HowMany_projects projects from:\n$in_file\n";
echo "\nWriting the top $HowMany_projects projects to: $out_file\n";

// need a valid doc, write the header 1st, and open tag
write_hdr($Output);

// File offset just past the most recent <project> line; null until the first
// project has been seen.
$proj_mark = null;

// Strict comparison: a final line consisting of "0" without a newline is
// loosely equal to false and would otherwise end the loop early.
// NOTE(review): fgets() is limited to 1023 bytes per call, so a longer line
// is seen in pieces and a tag split across two pieces will not be matched.
while (false !== ($line = fgets($F1, 1024))) {
  if (preg_match('/<project>/', $line)) {
    $proj_mark = ftell($F1);
  }
  elseif ($proj_mark !== null
          && preg_match('/<popularity_rank>([0-9]+).*</', $line, $rank_match)) {
    // The capture group holds the leading digits of the rank, taken directly
    // after the <popularity_rank> tag rather than after the first '>' found
    // anywhere on the line.
    if ((int) $rank_match[1] <= $HowMany_projects) {
      // presumably write_entry() copies the project starting at $proj_mark
      // from $F1 to $Output -- confirm against lib_projxml.h.php
      write_entry($F1, $proj_mark, $Output);
    }
  }
}

// write the end tag and close up shop

close_tag($Output);
fclose($F1);
fclose($Output);

echo "Done\n";
153
154?>
155