1#
2# $Id: IMDB.pm,v 1.69 2015/07/12 00:59:01 knowledgejunkie Exp $
3#
4# The IMDB file contains two packages:
5# 1. XMLTV::IMDB::Cruncher package which parses and manages IMDB "lists" files
6#    from ftp.imdb.com
7# 2. XMLTV::IMDB package that uses data files from the Cruncher package to
8#    update/add details to XMLTV programme nodes.
9#
10# FUTURE - multiple hits on the same 'title only' could try and look for
11#          character names matching from description to imdb.com character
12#          names.
13#
14# FUTURE - multiple hits on 'title only' should probably pick latest
15#          tv series over any older ones. May make for better guesses.
16#
17# BUG - we identify 'presenters' by the word "Host" appearing in the character
18#       description. For some movies, character names include the word Host.
19#       ex. Animal, The (2001) has a character named "Badger Milk Host".
20#
21# BUG - if there is a matching title with > 1 entry (say made for tv-movie and
22#       at tv-mini series) made in the same year (or even "close" years) it is
23#       possible for us to pick the wrong one we should pick the one with the
24#       closest year, not just the first closest match based on the result ordering
25#       for instance Ghost Busters was made in 1984, and into a tv series in
26#       1986. if we have a list of GhostBusters 1983, we should pick the 1984 movie
27#       and not 1986 tv series...maybe :) but currently we'll pick the first
28#       returned close enough match instead of trying the closest date match of
29#       the approx hits.
30#
31
32use strict;
33
34package XMLTV::IMDB;
35
36use open ':encoding(iso-8859-1)';   # try to enforce file encoding (does this work in Perl <5.8.1? )
37
38#
39# HISTORY
40# .6 = what was here for the longest time
41# .7 = fixed file size est calculations
42#    = moviedb.info now includes _file_size_uncompressed values for each downloaded file
43# .8 = updated file size est calculations
44#    = moviedb.dat directors and actors list no longer include repeated names (which mostly
45#      occured in episodic tv programs (reported by Alexy Khrabrov)
46# .9 = added keywords data
47# .10 = added plot data
48#
49our $VERSION = '0.10';      # version number of database
50
51sub new
52{
53    my ($type) = shift;
54    my $self={ @_ };            # remaining args become attributes
55
56    for ('imdbDir', 'verbose') {
57	die "invalid usage - no $_" if ( !defined($self->{$_}));
58    }
59    #$self->{verbose}=2;
60    $self->{replaceDates}=0        if ( !defined($self->{replaceDates}));
61    $self->{replaceTitles}=0       if ( !defined($self->{replaceTitles}));
62    $self->{replaceCategories}=0   if ( !defined($self->{replaceCategories}));
63    $self->{replaceKeywords}=0     if ( !defined($self->{replaceKeywords}));
64    $self->{replaceURLs}=0         if ( !defined($self->{replaceURLs}));
65    $self->{replaceDirectors}=1    if ( !defined($self->{replaceDirectors}));
66    $self->{replaceActors}=0       if ( !defined($self->{replaceActors}));
67    $self->{replacePresentors}=1   if ( !defined($self->{replacePresentors}));
68    $self->{replaceCommentators}=1 if ( !defined($self->{replaceCommentators}));
69    $self->{replaceStarRatings}=0  if ( !defined($self->{replaceStarRatings}));
70    $self->{replacePlot}=0         if ( !defined($self->{replacePlot}));
71
72    $self->{updateDates}=1        if ( !defined($self->{updateDates}));
73    $self->{updateTitles}=1       if ( !defined($self->{updateTitles}));
74    $self->{updateCategories}=1   if ( !defined($self->{updateCategories}));
75    $self->{updateCategoriesWithGenres}=1 if ( !defined($self->{updateCategoriesWithGenres}));
76    $self->{updateKeywords}=0     if ( !defined($self->{updateKeywords}));          # default is to NOT add keywords
77    $self->{updateURLs}=1         if ( !defined($self->{updateURLs}));
78    $self->{updateDirectors}=1    if ( !defined($self->{updateDirectors}));
79    $self->{updateActors}=1       if ( !defined($self->{updateActors}));
80    $self->{updatePresentors}=1   if ( !defined($self->{updatePresentors}));
81    $self->{updateCommentators}=1 if ( !defined($self->{updateCommentators}));
82    $self->{updateStarRatings}=1  if ( !defined($self->{updateStarRatings}));
83    $self->{updatePlot}=0         if ( !defined($self->{updatePlot}));          # default is to NOT add plot
84
85    $self->{numActors}=3          if ( !defined($self->{numActors}));           # default is to add top 3 actors
86
87    $self->{moviedbIndex}="$self->{imdbDir}/moviedb.idx";
88    $self->{moviedbData}="$self->{imdbDir}/moviedb.dat";
89    $self->{moviedbInfo}="$self->{imdbDir}/moviedb.info";
90    $self->{moviedbOffline}="$self->{imdbDir}/moviedb.offline";
91
92    # default is not to cache lookups
93    $self->{cacheLookups}=0 if ( !defined($self->{cacheLookups}) );
94    $self->{cacheLookupSize}=0 if ( !defined($self->{cacheLookupSize}) );
95
96    $self->{cachedLookups}->{tv_series}->{_cacheSize_}=0;
97
98    bless($self, $type);
99
100    $self->{categories}={'movie'          =>'Movie',
101			 'tv_movie'       =>'TV Movie', # made for tv
102			 'video_movie'    =>'Video Movie', # went straight to video or was made for it
103			 'tv_series'      =>'TV Series',
104			 'tv_mini_series' =>'TV Mini Series'};
105
106    $self->{stats}->{programCount}=0;
107
108    for my $cat (keys %{$self->{categories}}) {
109	$self->{stats}->{perfect}->{$cat}=0;
110	$self->{stats}->{close}->{$cat}=0;
111    }
112    $self->{stats}->{perfectMatches}=0;
113    $self->{stats}->{closeMatches}=0;
114
115    $self->{stats}->{startTime}=time();
116
117    return($self);
118}
119
120sub loadDBInfo($)
121{
122    my $file=shift;
123    my $info;
124
125    open(INFO, "< $file") || return("imdbDir index file \"$file\":$!\n");
126    while(<INFO>) {
127	chop();
128	if ( s/^([^:]+)://o ) {
129	    $info->{$1}=$_;
130	}
131    }
132    close(INFO);
133    return($info);
134}
135
136sub checkIndexesOkay($)
137{
138    my $self=shift;
139    if ( ! -d "$self->{imdbDir}" ) {
140	return("imdbDir \"$self->{imdbDir}\" does not exist\n");
141    }
142
143    if ( -f "$self->{moviedbOffline}" ) {
144	return("imdbDir index offline: check $self->{moviedbOffline} for details");
145    }
146
147    for my $file ($self->{moviedbIndex}, $self->{moviedbData}, $self->{moviedbInfo}) {
148	if ( ! -f "$file" ) {
149	    return("imdbDir index file \"$file\" does not exist\n");
150	}
151    }
152
153    $VERSION=~m/^(\d+)\.(\d+)$/o || die "package corrupt, VERSION string invalid ($VERSION)";
154    my ($major, $minor)=($1, $2);
155
156    my $info=loadDBInfo($self->{moviedbInfo});
157    return($info) if ( ref $info eq 'SCALAR' );
158
159    if ( !defined($info->{db_version}) ) {
160	return("imdbDir index db missing version information, rerun --prepStage all\n");
161    }
162    if ( $info->{db_version}=~m/^(\d+)\.(\d+)$/o ) {
163	if ( $1 != $major || $2 < $minor ) {
164	    return("imdbDir index db requires updating, rerun --prepStage all\n");
165	}
166	if ( $1 == 0 && $2 == 1 ) {
167	    return("imdbDir index db requires update, rerun --prepStage 5 (bug:actresses never appear)\n");
168	}
169	if ( $1 == 0 && $2 == 2 ) {
170	    # 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run
171	    return("imdbDir index db requires minor reindexing, rerun --prepStage 3 and 5\n");
172	}
173	if ( $1 == 0 && $2 == 3 ) {
174	    # 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run
175	    return("imdbDir index db requires major reindexing, rerun --prepStage 2 and new prepStages 5,6,7,8 and 9\n");
176	}
177	if ( $1 == 0 && $2 == 4 ) {
178	    # 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run
179	    return("imdbDir index db corrupt (got version 0.4), rerun --prepStage all\n");
180	}
181	# okay
182	return(undef);
183    }
184    else {
185	return("imdbDir index version of '$info->{db_version}' is invalid, rerun --prepStage all\n".
186	       "if problem persists, submit bug report to xmltv-devel\@lists.sf.net\n");
187    }
188}
189
190sub basicVerificationOfIndexes($)
191{
192    my $self=shift;
193
194    # check that the imdbdir is invalid and up and running
195    my $title="Army of Darkness";
196    my $year=1992;
197
198    $self->openMovieIndex() || return("basic verification of indexes failed\n".
199				      "database index isn't readable");
200
201    my $verbose = $self->{verbose}; $self->{verbose} = 0;
202    my $res=$self->getMovieMatches($title, $year);
203    $self->{verbose} = $verbose; undef $verbose;
204    if ( !defined($res) ) {
205	$self->closeMovieIndex();
206	return("basic verification of indexes failed\n".
207	       "no match for basic verification of movie \"$title, $year\"\n");
208    }
209    if ( !defined($res->{exactMatch}) ) {
210	$self->closeMovieIndex();
211	return("basic verification of indexes failed\n".
212	       "no exact match for movie \"$title, $year\"\n");
213    }
214    if ( scalar(@{$res->{exactMatch}})!= 1) {
215	$self->closeMovieIndex();
216	return("basic verification of indexes failed\n".
217	       "got more than one exact match for movie \"$title, $year\"\n");
218    }
219    my @exact=@{$res->{exactMatch}};
220    if ( $exact[0]->{title} ne $title ) {
221	$self->closeMovieIndex();
222	return("basic verification of indexes failed\n".
223	       "title associated with key \"$title, $year\" is bad\n");
224    }
225
226    if ( $exact[0]->{year} ne "$year" ) {
227	$self->closeMovieIndex();
228	return("basic verification of indexes failed\n".
229	       "year associated with key \"$title, $year\" is bad\n");
230    }
231
232    my $id=$exact[0]->{id};
233    $res=$self->getMovieIdDetails($id);
234    if ( !defined($res) ) {
235	$self->closeMovieIndex();
236	return("basic verification of indexes failed\n".
237	       "no movie details for movie \"$title, $year\" (id=$id)\n");
238    }
239
240    if ( !defined($res->{directors}) ) {
241	$self->closeMovieIndex();
242	return("basic verification of indexes failed\n".
243	       "movie details didn't provide any director for movie \"$title, $year\" (id=$id)\n");
244    }
245    if ( !$res->{directors}[0]=~m/Raimi/o ) {
246	$self->closeMovieIndex();
247	return("basic verification of indexes failed\n".
248	       "movie details didn't show Raimi as the main director for movie \"$title, $year\" (id=$id)\n");
249    }
250    if ( !defined($res->{actors}) ) {
251	$self->closeMovieIndex();
252	return("basic verification of indexes failed\n".
253	       "movie details didn't provide any cast movie \"$title, $year\" (id=$id)\n");
254    }
255    if ( !$res->{actors}[0]=~m/Campbell/o ) {
256	$self->closeMovieIndex();
257	return("basic verification of indexes failed\n".
258	       "movie details didn't show Bruce Campbell as the main actor in movie \"$title, $year\" (id=$id)\n");
259    }
260    my $matches=0;
261    for (@{$res->{genres}}) {
262	if ( $_ eq "Action" ||
263	     $_ eq "Comedy" ||
264	     $_ eq "Fantasy" ||
265	     $_ eq "Horror" ||
266	     $_ eq "Romance" ) {
267	    $matches++;
268	}
269    }
270    if ( $matches == 0 ) {
271	$self->closeMovieIndex();
272	return("basic verification of indexes failed\n".
273	       "movie details didn't show genres correctly for movie \"$title, $year\" (id=$id)\n");
274    }
275    if ( !defined($res->{ratingDist}) ||
276	 !defined($res->{ratingVotes}) ||
277	 !defined($res->{ratingRank}) ) {
278	$self->closeMovieIndex();
279	return("basic verification of indexes failed\n".
280	       "movie details didn't show imdbratings for movie \"$title, $year\" (id=$id)\n");
281    }
282    $self->closeMovieIndex();
283    return(undef);
284
285}
286
287sub sanityCheckDatabase($)
288{
289    my $self=shift;
290    my $errline;
291
292    $errline=$self->checkIndexesOkay();
293    return($errline) if ( defined($errline) );
294    $errline=$self->basicVerificationOfIndexes();
295    return($errline) if ( defined($errline) );
296
297    # all okay
298    return(undef);
299}
300
301sub error($$)
302{
303    print STDERR "tv_imdb: $_[1]\n";
304}
305
306sub status($$)
307{
308    if ( $_[0]->{verbose} ) {
309	print STDERR "tv_imdb: $_[1]\n";
310    }
311}
312
313sub debug($$)
314{
315    my $self=shift;
316    my $mess=shift;
317    if ( $self->{verbose} > 1 ) {
318	print STDERR "tv_imdb: $mess\n";
319    }
320}
321
322use Search::Dict;
323
324sub openMovieIndex($)
325{
326    my $self=shift;
327
328    if ( !open($self->{INDEX_FD}, "< $self->{moviedbIndex}") ) {
329	return(undef);
330    }
331    if ( !open($self->{DBASE_FD}, "< $self->{moviedbData}") ) {
332	close($self->{INDEX_FD});
333	return(undef);
334    }
335    return(1);
336}
337
338sub closeMovieIndex($)
339{
340    my $self=shift;
341
342    close($self->{INDEX_FD});
343    delete($self->{INDEX_FD});
344
345    close($self->{DBASE_FD});
346    delete($self->{DBASE_FD});
347
348    return(1);
349}
350
351# moviedbIndex file has the format:
352# title:lineno
353# where key is a url encoded title followed by the year of production and a colon
354sub getMovieMatches($$$)
355{
356    my $self=shift;
357    my $title=shift;
358    my $year=shift;
359
360    # Articles are put at the end of a title ( in all languages )
361    #$match=~s/^(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/og;
362
363    my $match="$title";
364    if ( defined($year) ) {
365	$match.=" ($year)";
366    }
367
368    # to encode s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg
369    # to decode s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
370
371    # url encode
372    $match=lc($match);
373    $match=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg;
374
375    $self->debug("looking for \"$match\" in $self->{moviedbIndex}");
376    if ( !$self->{INDEX_FD} ) {
377	die "internal error: index not open";
378    }
379
380    my $FD=$self->{INDEX_FD};
381    Search::Dict::look(*{$FD}, $match, 0, 0);
382    my $results;
383    while (<$FD>) {
384	last if ( !m/^$match/ );
385
386	chop();
387	my @arr=split('\t', $_);
388	if ( scalar(@arr) != 5 ) {
389	    warn "$self->{moviedbIndex} corrupt (correct key:$_)";
390	    next;
391	}
392
393	if ( $arr[0] eq $match ) {
394	    # return title and id
395	    #$arr[1]=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
396
397	    #$arr[0]=~s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
398	    #$self->debug("exact:$arr[1] ($arr[2]) qualifier=$arr[3] id=$arr[4]");
399	    my $title=$arr[1];
400	    if ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\)$//o ) {
401	    }
402	    elsif ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\/[IVX]+\)$//o ) {
403	    }
404	    else {
405		die "unable to decode year from title key \"$title\", report to xmltv-devel\@lists.sf.net";
406	    }
407	    $title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
408	    $self->debug("exact:$title ($arr[2]) qualifier=$arr[3] id=$arr[4]");
409	    push(@{$results->{exactMatch}}, {'key'=> $arr[1],
410					     'title'=>$title,
411					     'year'=>$arr[2],
412					     'qualifier'=>$arr[3],
413					     'id'=>$arr[4]});
414	}
415	else {
416	    # decode
417	    #s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
418	    # return title
419	    #$arr[1]=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
420	    #$arr[0]=~s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge;
421	    #$self->debug("close:$arr[1] ($arr[2]) qualifier=$arr[3] id=$arr[4]");
422	    my $title=$arr[1];
423
424	    if ( $title=~m/^\"/o && $title=~m/\"\s*\(/o ) { #"
425		$title=~s/^\"//o; #"
426		$title=~s/\"(\s*\()/$1/o; #"
427	    }
428
429	    if ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\)$//o ) {
430	    }
431	    elsif ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\/[IVX]+\)$//o ) {
432	    }
433	    else {
434		die "unable to decode year from title key \"$title\", report to xmltv-devel\@lists.sf.net";
435	    }
436	    $title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
437	    $self->debug("close:$title ($arr[2]) qualifier=$arr[3] id=$arr[4]");
438	    push(@{$results->{closeMatch}}, {'key'=> $arr[1],
439					     'title'=>$title,
440					     'year'=>$arr[2],
441					     'qualifier'=>$arr[3],
442					     'id'=>$arr[4]});
443	}
444    }
445    #print "MovieMatches on ($match) = ".Dumper($results)."\n";
446    return($results);
447}
448
449sub getMovieExactMatch($$$)
450{
451    my $self=shift;
452    my $title=shift;
453    my $year=shift;
454    my $res=$self->getMovieMatches($title, $year);
455
456    return(undef) if ( !defined($res) );
457    if ( !defined($res->{exactMatch}) ) {
458	return(undef);
459    }
460    if ( scalar(@{$res->{exactMatch}}) != 1 ) {
461	return(undef);
462    }
463    return($res->{exactMatch}[0]);
464}
465
466sub getMovieCloseMatches($$)
467{
468    my $self=shift;
469    my $title=shift;
470
471    my $res=$self->getMovieMatches($title, undef) || return(undef);
472
473    if ( defined($res->{exactMatch})) {
474	die "corrupt imdb database - hit on \"$title\"";
475    }
476    return(undef) if ( !defined($res->{closeMatch}) );
477    my @arr=@{$res->{closeMatch}};
478    #print "CLOSE DUMP=".Dumper(@arr)."\n";
479    return(@arr);
480}
481
482sub getMovieIdDetails($$)
483{
484    my $self=shift;
485    my $id=shift;
486
487    if ( !$self->{DBASE_FD} ) {
488	die "internal error: index not open";
489    }
490    my $results;
491    my $FD=$self->{DBASE_FD};
492    Search::Dict::look(*{$FD}, "$id:", 0, 0);
493    while (<$FD>) {
494	last if ( !m/^$id:/ );
495	chop();
496	if ( s/^$id:// ) {
497	    my ($directors, $actors, $genres, $ratingDist, $ratingVotes, $ratingRank, $keywords, $plot)=split('\t', $_);
498	    if ( $directors ne "<>" ) {
499		for my $name (split('\|', $directors)) {
500		    # remove (I) etc from imdb.com names (kept in place for reference)
501		    $name=~s/\s\([IVX]+\)$//o;
502		    # switch name around to be surname last
503		    $name=~s/^([^,]+),\s*(.*)$/$2 $1/o;
504		    push(@{$results->{directors}}, $name);
505		}
506	    }
507	    if ( $actors ne "<>" ) {
508		for my $name (split('\|', $actors)) {
509		    # remove (I) etc from imdb.com names (kept in place for reference)
510		    my $HostNarrator;
511		    if ( $name=~s/\[([^\]]+)\]$//o ) {
512			$HostNarrator=$1;
513		    }
514		    $name=~s/\s\([IVX]+\)$//o;
515
516		    # switch name around to be surname last
517		    $name=~s/^([^,]+),\s*(.*)$/$2 $1/o;
518		    if ( $HostNarrator ) {
519			if ( $HostNarrator=~s/,*Host//o ) {
520			    push(@{$results->{presenter}}, $name);
521			}
522			if ( $HostNarrator=~s/,*Narrator//o ) {
523			    push(@{$results->{commentator}}, $name);
524			}
525		    }
526		    else {
527			push(@{$results->{actors}}, $name);
528		    }
529		}
530	    }
531	    if ( $genres ne "<>" ) {
532		push(@{$results->{genres}}, split('\|', $genres));
533	    }
534	    if ( $keywords ne "<>" ) {
535		push(@{$results->{keywords}}, split(',', $keywords));
536	    }
537	    $results->{ratingDist}=$ratingDist if ( $ratingDist ne "<>" );
538	    $results->{ratingVotes}=$ratingVotes if ( $ratingVotes ne "<>" );
539	    $results->{ratingRank}=$ratingRank if ( $ratingRank ne "<>" );
540	    $results->{plot}=$plot if ( $plot ne "<>" );
541	}
542	else {
543	    warn "lookup of movie (id=$id) resulted in garbage ($_)";
544	}
545    }
546    if ( !defined($results) ) {
547	# some movies we don't have any details for
548	$results->{noDetails}=1;
549    }
550    #print "MovieDetails($id) = ".Dumper($results)."\n";
551    return($results);
552}
553
554#
555# FUTURE - close hit could be just missing or extra
556#          punctuation:
557#       "Run Silent, Run Deep" for imdb's "Run Silent Run Deep"
558#       "Cherry, Harry and Raquel" for imdb's "Cherry, Harry and Raquel!"
559#       "Cat Women of the Moon" for imdb's "Cat-Women of the Moon"
560#       "Baywatch Hawaiian Wedding" for imdb's "Baywatch: Hawaiian Wedding" :)
561#
562# FIXED - "Victoria and Albert" appears for imdb's "Victoria & Albert" (and -> &)
563# FIXED - "Columbo Cries Wolf" appears instead of "Columbo:Columbo Cries Wolf"
564# FIXED - Place the article last, for multiple languages. For instance
565#         Los amantes del c�rculo polar -> amantes del c�rculo polar, Los
566# FIXED - common international vowel changes. For instance
567#          "Anna Kar�nin" (�->e)
568#
569sub alternativeTitles($)
570{
571    my $title=shift;
572    my @titles;
573
574    push(@titles, $title);
575
576    # try the & -> and conversion
577    if ( $title=~m/\&/o ) {
578	my $t=$title;
579	while ( $t=~s/(\s)\&(\s)/$1and$2/o ) {
580	    push(@titles, $t);
581	}
582    }
583
584    # try the and -> & conversion
585    if ( $title=~m/\sand\s/io ) {
586	my $t=$title;
587	while ( $t=~s/(\s)and(\s)/$1\&$2/io ) {
588	    push(@titles, $t);
589	}
590    }
591
592    # try the "Columbo: Columbo cries Wolf" -> "Columbo cries Wolf" conversion
593    my $max=scalar(@titles);
594    for (my $i=0; $i<$max ; $i++) {
595	my $t=$titles[$i];
596        if ( $t=~m/^[^:]+:.+$/io ) {
597            while ( $t=~s/^[^:]+:\s*(.+)\s*$/$1/io ) {
598                push(@titles, $t);
599            }
600        }
601    }
602
603    # Place the articles last
604    $max=scalar(@titles);
605    for (my $i=0; $i<$max ; $i++) {
606	my $t=$titles[$i];
607        if ( $t=~m/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/io ) {
608            $t=~s/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/iog;
609            push(@titles, $t);
610        }
611        if ( $t=~m/^(.+),\s*(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)$/io ) {
612            $t=~s/^(.+),\s*(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/iog;
613            push(@titles, $t);
614        }
615    }
616
617    # convert all the special language characters
618    $max=scalar(@titles);
619    for (my $i=0; $i<$max ; $i++) {
620	my $t=$titles[$i];
621	if ( $t=~m/[����������������������������������������������������������]/io ) {
622	    $t=~s/[������������]/a/gio;
623	    $t=~s/[��������]/e/gio;
624	    $t=~s/[��������]/i/gio;
625	    $t=~s/[������������]/o/gio;
626	    $t=~s/[��������]/u/gio;
627	    $t=~s/[��]/ae/gio;
628	    $t=~s/[��]/c/gio;
629	    $t=~s/[��]/n/gio;
630	    $t=~s/[�]/ss/gio;
631	    $t=~s/[���]/y/gio;
632	    $t=~s/[�]//gio;
633	    push(@titles, $t);
634	}
635    }
636
637    # optional later possible titles include removing the '.' from titles
638    # ie "Project V.I.P.E.R." matching imdb "Project VIPER"
639    $max=scalar(@titles);
640    for (my $i=0; $i<$max ; $i++) {
641	my $t=$titles[$i];
642	if ( $t=~s/\.//go ) {
643	    push(@titles,$t);
644	}
645    }
646    return(\@titles);
647}
648
649sub findMovieInfo($$$$)
650{
651    my ($self, $title, $year, $exact)=@_;
652
653    my @titles=@{alternativeTitles($title)};
654
655    if ( $exact == 1 ) {
656	# try an exact match first :)
657	for my $mytitle ( @titles ) {
658	    my $info=$self->getMovieExactMatch($mytitle, $year);
659	    if ( defined($info) ) {
660		if ( $info->{qualifier} eq "movie" ) {
661		    $self->status("perfect hit on movie \"$info->{key}\"");
662		    $info->{matchLevel}="perfect";
663		    return($info);
664		}
665		elsif ( $info->{qualifier} eq "tv_movie" ) {
666		    $self->status("perfect hit on made-for-tv-movie \"$info->{key}\"");
667		    $info->{matchLevel}="perfect";
668		    return($info);
669		}
670		elsif ( $info->{qualifier} eq "video_movie" ) {
671		    $self->status("perfect hit on made-for-video-movie \"$info->{key}\"");
672		    $info->{matchLevel}="perfect";
673		    return($info);
674		}
675		elsif ( $info->{qualifier} eq "video_game" ) {
676		    next;
677		}
678		elsif ( $info->{qualifier} eq "tv_series" ) {
679		}
680		elsif ( $info->{qualifier} eq "tv_mini_series" ) {
681		}
682		else {
683		    $self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
684		    $self->error("weird trailing qualifier \"$info->{qualifier}\"");
685		    $self->error("submit bug report to xmltv-devel\@lists.sf.net");
686		}
687	    }
688	    $self->debug("no exact title/year hit on \"$mytitle ($year)\"");
689	}
690	return(undef);
691    }
692    elsif ( $exact == 2 ) {
693	# looking for first exact match on the title, don't have a year to compare
694
695	for my $mytitle ( @titles ) {
696	    # try close hit if only one :)
697	    my $cnt=0;
698	    my @closeMatches=$self->getMovieCloseMatches("$mytitle");
699
700	    # we traverse the hits twice, first looking for success,
701	    # then again to produce warnings about missed close matches
702	    for my $info (@closeMatches) {
703		next if ( !defined($info) );
704		$cnt++;
705
706		# within one year with exact match good enough
707		if ( lc($mytitle) eq lc($info->{title}) ) {
708
709		    if ( $info->{qualifier} eq "movie" ) {
710			$self->status("close enough hit on movie \"$info->{key}\" (since no 'date' field present)");
711			$info->{matchLevel}="close";
712			return($info);
713		    }
714		    elsif ( $info->{qualifier} eq "tv_movie" ) {
715			$self->status("close enough hit on made-for-tv-movie \"$info->{key}\" (since no 'date' field present)");
716			$info->{matchLevel}="close";
717			return($info);
718		    }
719		    elsif ( $info->{qualifier} eq "video_movie" ) {
720			$self->status("close enough hit on made-for-video-movie \"$info->{key}\" (since no 'date' field present)");
721			$info->{matchLevel}="close";
722			return($info);
723		    }
724		    elsif ( $info->{qualifier} eq "video_game" ) {
725			next;
726		    }
727		    elsif ( $info->{qualifier} eq "tv_series" ) {
728		    }
729		    elsif ( $info->{qualifier} eq "tv_mini_series" ) {
730		    }
731		    else {
732			$self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
733			$self->error("weird trailing qualifier \"$info->{qualifier}\"");
734			$self->error("submit bug report to xmltv-devel\@lists.sf.net");
735		    }
736		}
737	    }
738	}
739	# nothing worked
740	return(undef);
741    }
742
743    # otherwise we're looking for a title match with a close year
744    for my $mytitle ( @titles ) {
745	# try close hit if only one :)
746	my $cnt=0;
747	my @closeMatches=$self->getMovieCloseMatches("$mytitle");
748
749	# we traverse the hits twice, first looking for success,
750	# then again to produce warnings about missed close matches
751	for my $info (@closeMatches) {
752	    next if ( !defined($info) );
753	    $cnt++;
754
755	    # within one year with exact match good enough
756	    if ( lc($mytitle) eq lc($info->{title}) ) {
757		my $yearsOff=abs(int($info->{year})-$year);
758
759		$info->{matchLevel}="close";
760
761		if ( $yearsOff <= 2 ) {
762		    my $showYear=int($info->{year});
763
764		    if ( $info->{qualifier} eq "movie" ) {
765			$self->status("close enough hit on movie \"$info->{key}\" (off by $yearsOff years)");
766			return($info);
767		    }
768		    elsif ( $info->{qualifier} eq "tv_movie" ) {
769			$self->status("close enough hit on made-for-tv-movie \"$info->{key}\" (off by $yearsOff years)");
770			return($info);
771		    }
772		    elsif ( $info->{qualifier} eq "video_movie" ) {
773			$self->status("close enough hit on made-for-video-movie \"$info->{key}\" (off by $yearsOff years)");
774			return($info);
775		    }
776		    elsif ( $info->{qualifier} eq "video_game" ) {
777			$self->status("ignoring close hit on video-game \"$info->{key}\"");
778			next;
779		    }
780		    elsif ( $info->{qualifier} eq "tv_series" ) {
781			$self->status("ignoring close hit on tv series \"$info->{key}\"");
782			#$self->status("close enough hit on tv series \"$info->{key}\" (off by $yearsOff years)");
783		    }
784		    elsif ( $info->{qualifier} eq "tv_mini_series" ) {
785			$self->status("ignoring close hit on tv mini-series \"$info->{key}\"");
786			#$self->status("close enough hit on tv mini-series \"$info->{key}\" (off by $yearsOff years)");
787		    }
788		    else {
789			$self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
790			$self->error("weird trailing qualifier \"$info->{qualifier}\"");
791			$self->error("submit bug report to xmltv-devel\@lists.sf.net");
792		    }
793		}
794	    }
795	}
796
797	# if we found at least something, but nothing matched
798	# produce warnings about missed, but close matches
799	for my $info (@closeMatches) {
800	    next if ( !defined($info) );
801
802	    # within one year with exact match good enough
803	    if ( lc($mytitle) eq lc($info->{title}) ) {
804		my $yearsOff=abs(int($info->{year})-$year);
805		if ( $yearsOff <= 2 ) {
806		    #die "internal error: key \"$info->{key}\" failed to be processed properly";
807		}
808		elsif ( $yearsOff <= 5 ) {
809		    # report these as status
810		    $self->status("ignoring close, but not good enough hit on \"$info->{key}\" (off by $yearsOff years)");
811		}
812		else {
813		    # report these as debug messages
814		    $self->debug("ignoring close hit on \"$info->{key}\" (off by $yearsOff years)");
815		}
816	    }
817	    else {
818		$self->debug("ignoring close hit on \"$info->{key}\" (title did not match)");
819	    }
820	}
821    }
822    #$self->status("failed to lookup \"$title ($year)\"");
823    return(undef);
824}
825
826sub findTVSeriesInfo($$)
827{
828    my ($self, $title)=@_;
829
830    if ( $self->{cacheLookups} ) {
831	my $id=$self->{cachedLookups}->{tv_series}->{$title};
832
833	if ( defined($id) ) {
834	    #print STDERR "REF= (".ref($id).")\n";
835	    if ( $id ne '' ) {
836		return($id);
837	    }
838	    return(undef);
839	}
840    }
841
842    my @titles=@{alternativeTitles($title)};
843
844    # try an exact match first :)
845    my $idInfo;
846
847    for my $mytitle ( @titles ) {
848	# try close hit if only one :)
849	my $cnt=0;
850	my @closeMatches=$self->getMovieCloseMatches("$mytitle");
851
852	for my $info (@closeMatches) {
853	    next if ( !defined($info) );
854	    $cnt++;
855
856	    if ( lc($mytitle) eq lc($info->{title}) ) {
857
858		$info->{matchLevel}="perfect";
859
860		if ( $info->{qualifier} eq "movie" ) {
861		    #$self->status("ignoring close hit on movie \"$info->{key}\"");
862		}
863		elsif ( $info->{qualifier} eq "tv_movie" ) {
864		    #$self->status("ignoring close hit on tv movie \"$info->{key}\"");
865		}
866		elsif ( $info->{qualifier} eq "video_movie" ) {
867		    #$self->status("ignoring close hit on made-for-video-movie \"$info->{key}\"");
868		}
869		elsif ( $info->{qualifier} eq "video_game" ) {
870		    #$self->status("ignoring close hit on made-for-video-movie \"$info->{key}\"");
871		    next;
872		}
873		elsif ( $info->{qualifier} eq "tv_series" ) {
874		    $idInfo=$info;
875		    $self->status("perfect hit on tv series \"$info->{key}\"");
876		    last;
877		}
878		elsif ( $info->{qualifier} eq "tv_mini_series" ) {
879		    $idInfo=$info;
880		    $self->status("perfect hit on tv mini-series \"$info->{key}\"");
881		    last;
882		}
883		else {
884		    $self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\"");
885		    $self->error("weird trailing qualifier \"$info->{qualifier}\"");
886		    $self->error("submit bug report to xmltv-devel\@lists.sf.net");
887		}
888	    }
889	}
890	last if ( defined($idInfo) );
891    }
892
893    if ( $self->{cacheLookups} ) {
894	# flush cache after this lookup if its gotten too big
895	if ( $self->{cachedLookups}->{tv_series}->{_cacheSize_} >
896	     $self->{cacheLookupSize} ) {
897	    delete($self->{cachedLookups}->{tv_series});
898	    $self->{cachedLookups}->{tv_series}->{_cacheSize_}=0;
899	}
900	if ( defined($idInfo) ) {
901	    $self->{cachedLookups}->{tv_series}->{$title}=$idInfo;
902	}
903	else {
904	    $self->{cachedLookups}->{tv_series}->{$title}="";
905	}
906	$self->{cachedLookups}->{tv_series}->{_cacheSize_}++;
907    }
908    if ( defined($idInfo) ) {
909	return($idInfo);
910    }
911    else {
912	#$self->status("failed to lookup tv series \"$title\"");
913	return(undef);
914    }
915}
916
917#
918# todo - add country of origin
919# todo - video (colour/aspect etc) details
920# todo - audio (stereo) details
921# todo - ratings ? - use certificates.list
922# todo - add description - plot summaries ? - which one do we choose ?
923# todo - writer
924# todo - producer
925# todo - running time (duration)
926# todo - identify 'Host' and 'Narrator's and put them in as
927#        credits:presenter and credits:commentator resp.
928# todo - check program length - probably a warning if longer ?
929#        can we update length (separate from runnning time in the output ?)
930# todo - icon - url from www.imdb.com of programme image ?
931#        this could be done by scraping for the hyper linked poster
932#        <a name="poster"><img src="http://ia.imdb.com/media/imdb/01/I/60/69/80m.jpg" height="139" width="99" border="0"></a>
933#        and grabbin' out the img entry. (BTW ..../npa.jpg seems to line up with no poster available)
934#
935#
936sub applyFound($$$)
937{
938    my ($self, $prog, $idInfo)=@_;
939
940    my $title=$prog->{title}->[0]->[0];
941
942    if ( $self->{updateDates} ) {
943	my $date;
944
945	# don't add dates only fix them for tv_series
946	if ( $idInfo->{qualifier} eq "movie" ||
947	     $idInfo->{qualifier} eq "video_movie" ||
948	     $idInfo->{qualifier} eq "tv_movie" ) {
949	    #$self->debug("adding 'date' field (\"$idInfo->{year}\") on \"$title\"");
950	    $date=int($idInfo->{year});
951	}
952	else {
953	    #$self->debug("not adding 'date' field to $idInfo->{qualifier} \"$title\"");
954	    $date=undef;
955	}
956
957	if ( $self->{replaceDates} ) {
958	    if ( defined($prog->{date}) && defined($date) ) {
959		$self->debug("replacing 'date' field");
960		delete($prog->{date});
961		$prog->{date}=$date;
962	    }
963	}
964	else {
965	    # only set date if not already defined
966	    if ( !defined($prog->{date}) && defined($date) ) {
967		$prog->{date}=$date;
968	    }
969	}
970    }
971
972    if ( $self->{updateTitles} ) {
973	if ( $idInfo->{title} ne $title ) {
974	    if ( $self->{replaceTitles} ) {
975		$self->debug("replacing (all) 'title' from \"$title\" to \"$idInfo->{title}\"");
976		delete($prog->{title});
977	    }
978
979	    my @list;
980
981	    push(@list, [$idInfo->{title}, undef]);
982
983	    if ( defined($prog->{title}) ) {
984		my $name=$idInfo->{title};
985		my $found=0;
986		for my $v (@{$prog->{title}}) {
987		    if ( lc($v->[0]) eq lc($name) ) {
988			$found=1;
989		    }
990		    else {
991			push(@list, $v);
992		    }
993		}
994	    }
995	    $prog->{title}=\@list;
996	}
997    }
998
999    if ( $self->{updateURLs} ) {
1000	if ( $self->{replaceURLs} ) {
1001	    if ( defined($prog->{url}) ) {
1002		$self->debug("replacing (all) 'url'");
1003		delete($prog->{url});
1004	    }
1005	}
1006
1007	# add url to programme on www.imdb.com
1008	my $url=$idInfo->{key};
1009
1010	$url=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg;
1011	$url="http://us.imdb.com/M/title-exact?".$url;
1012
1013	if ( defined($prog->{url}) ) {
1014	    my @rep;
1015	    push(@rep, $url);
1016	    for (@{$prog->{url}}) {
1017		# skip urls for imdb.com that we're probably safe to replace
1018		if ( !m;^http://us.imdb.com/M/title-exact;o ) {
1019		    push(@rep, $_);
1020		}
1021	    }
1022	    $prog->{url}=\@rep;
1023	}
1024	else {
1025	    push(@{$prog->{url}}, $url);
1026	}
1027    }
1028
1029    # squirrel away movie qualifier so its first on the list of replacements
1030    my @categories;
1031    push(@categories, [$self->{categories}->{$idInfo->{qualifier}}, 'en']);
1032    if ( !defined($self->{categories}->{$idInfo->{qualifier}}) ) {
1033	die "how did we get here with an invalid qualifier '$idInfo->{qualifier}'";
1034    }
1035
1036    my $details=$self->getMovieIdDetails($idInfo->{id});
1037    if ( $details->{noDetails} ) {
1038	# we don't have any details on this movie
1039    }
1040    else {
1041	# add directors list
1042	if ( $self->{updateDirectors} && defined($details->{directors}) ) {
1043	    # only update directors if we have exactly one or if
1044	    # its a movie of some kind, add more than one.
1045	    if ( scalar(@{$details->{directors}}) == 1 ||
1046		 $idInfo->{qualifier} eq "movie" ||
1047		 $idInfo->{qualifier} eq "video_movie" ||
1048		 $idInfo->{qualifier} eq "tv_movie" ) {
1049
1050		if ( $self->{replaceDirectors} ) {
1051		    if ( defined($prog->{credits}->{director}) ) {
1052			$self->debug("replacing director(s)");
1053			delete($prog->{credits}->{director});
1054		    }
1055		}
1056
1057		my @list;
1058		# add top 3 billing directors list form www.imdb.com
1059		for my $name (splice(@{$details->{directors}},0,3)) {
1060		    push(@list, $name);
1061		}
1062
1063		# preserve all existing directors listed if we did't already have them.
1064		if ( defined($prog->{credits}->{director}) ) {
1065		    for my $name (@{$prog->{credits}->{director}}) {
1066			my $found=0;
1067			for(@list) {
1068			    if ( lc eq lc($name) ) {
1069				$found=1;
1070			    }
1071			}
1072			if ( !$found ) {
1073			    push(@list, $name);
1074			}
1075		    }
1076		}
1077		$prog->{credits}->{director}=\@list;
1078	    }
1079	    else {
1080		$self->debug("not adding 'director' field to $idInfo->{qualifier} \"$title\"");
1081	    }
1082	}
1083
1084	if ( $self->{updateActors} && defined($details->{actors}) ) {
1085	    if ( $self->{replaceActors} ) {
1086		if ( defined($prog->{credits}->{actor}) ) {
1087		    $self->debug("replacing actor(s) on $idInfo->{qualifier} \"$idInfo->{key}\"");
1088		    delete($prog->{credits}->{actor});
1089		}
1090	    }
1091
1092	    my @list;
1093	    # add top billing actors (default = 3) from www.imdb.com
1094	    for my $name (splice(@{$details->{actors}},0,$self->{numActors})) {
1095		push(@list, $name);
1096	    }
1097	    # preserve all existing actors listed if we did't already have them.
1098	    if ( defined($prog->{credits}->{actor}) ) {
1099		for my $name (@{$prog->{credits}->{actor}}) {
1100		    my $found=0;
1101		    for(@list) {
1102			if ( lc eq lc($name) ) {
1103			    $found=1;
1104			}
1105		    }
1106		    if ( !$found ) {
1107			push(@list, $name);
1108		    }
1109		}
1110	    }
1111	    $prog->{credits}->{actor}=\@list;
1112	}
1113
1114	if ( $self->{updatePresentors} && defined($details->{presenter}) ) {
1115	    if ( $self->{replacePresentors} ) {
1116		if ( defined($prog->{credits}->{presenter}) ) {
1117		    $self->debug("replacing presentor");
1118		    delete($prog->{credits}->{presenter});
1119		}
1120	    }
1121	    $prog->{credits}->{presenter}=$details->{presenter};
1122	}
1123	if ( $self->{updateCommentators} && defined($details->{commentator}) ) {
1124	    if ( $self->{replaceCommentators} ) {
1125		if ( defined($prog->{credits}->{commentator}) ) {
1126		    $self->debug("replacing commentator");
1127		    delete($prog->{credits}->{commentator});
1128		}
1129	    }
1130	    $prog->{credits}->{commentator}=$details->{commentator};
1131	}
1132
1133	# push genres as categories
1134	if ( $self->{updateCategoriesWithGenres} ) {
1135	    if ( defined($details->{genres}) ) {
1136		for (@{$details->{genres}}) {
1137		    push(@categories, [$_, 'en']);
1138		}
1139	    }
1140	}
1141
1142	if ( $self->{updateStarRatings} && defined($details->{ratingRank}) ) {
1143	    if ( $self->{replaceStarRatings} ) {
1144		if ( defined($prog->{'star-rating'}) ) {
1145		    $self->debug("replacing 'star-rating'");
1146		    delete($prog->{'star-rating'});
1147		}
1148		unshift( @{$prog->{'star-rating'}}, [ $details->{ratingRank} . "/10", 'IMDB User Rating' ] );
1149	    }
1150	    else {
1151	        # add IMDB User Rating in front of all other star-ratings
1152		unshift( @{$prog->{'star-rating'}}, [ $details->{ratingRank} . "/10", 'IMDB User Rating' ] );
1153	    }
1154	}
1155
1156    if ( $self->{updateKeywords} ) {
1157        my @keywords;
1158        if ( defined($details->{keywords}) ) {
1159            for (@{$details->{keywords}}) {
1160                push(@keywords, [$_, 'en']);
1161            }
1162        }
1163
1164        if ( $self->{replaceKeywords} ) {
1165            if ( defined($prog->{keywords}) ) {
1166                $self->debug("replacing (all) 'keywords'");
1167                delete($prog->{keywords});
1168            }
1169        }
1170        if ( defined($prog->{keyword}) ) {
1171	    for my $value (@{$prog->{keyword}}) {
1172		my $found=0;
1173		for my $k (@keywords) {
1174		    if ( lc($k->[0]) eq lc($value->[0]) ) {
1175			$found=1;
1176		    }
1177		}
1178		if ( !$found ) {
1179		    push(@keywords, $value);
1180		}
1181	    }
1182	}
1183	$prog->{keyword}=\@keywords;
1184    }
1185
1186        if ( $self->{updatePlot} ) {
1187            # plot is held as a <desc> entity
1188            # if 'replacePlot' then delete all existing <desc> entities and add new
1189            # else add this plot as an additional <desc> entity
1190            #
1191            if ( $self->{replacePlot} ) {
1192                if ( defined($prog->{desc}) ) {
1193                    $self->debug("replacing (all) 'desc'");
1194                    delete($prog->{desc});
1195                }
1196            }
1197            if ( defined($details->{plot}) ) {
1198                # check it's not already there
1199                my $found = 0;
1200                for my $_desc ( @{$prog->{desc}} ) {
1201                    $found = 1  if ( @{$_desc}[0] eq $details->{plot} );
1202                }
1203                push @{$prog->{desc}}, [ $details->{plot}, 'en' ]  if !$found;
1204            }
1205        }
1206
1207    }
1208
1209    if ( $self->{updateCategories} ) {
1210	if ( $self->{replaceCategories} ) {
1211	    if ( defined($prog->{category}) ) {
1212		$self->debug("replacing (all) 'category'");
1213		delete($prog->{category});
1214	    }
1215	}
1216	if ( defined($prog->{category}) ) {
1217	    for my $value (@{$prog->{category}}) {
1218		my $found=0;
1219		#print "checking category $value->[0] with $mycategory\n";
1220		for my $c (@categories) {
1221		    if ( lc($c->[0]) eq lc($value->[0]) ) {
1222			$found=1;
1223		    }
1224		}
1225		if ( !$found ) {
1226		    push(@categories, $value);
1227		}
1228	    }
1229	}
1230	$prog->{category}=\@categories;
1231    }
1232
1233    return($prog);
1234}
1235
1236sub augmentProgram($$$)
1237{
1238    my ($self, $prog, $movies_only)=@_;
1239
1240    $self->{stats}->{programCount}++;
1241
1242    # assume first title in first language is the one we want.
1243    my $title=$prog->{title}->[0]->[0];
1244
1245    if ( defined($prog->{date}) && $prog->{date}=~m/^\d\d\d\d$/o ) {
1246
1247	# for programs with dates we try:
1248	# - exact matches on movies
1249	# - exact matches on tv series
1250	# - close matches on movies
1251	my $id=$self->findMovieInfo($title, $prog->{date}, 1); # exact match
1252	if ( !defined($id) ) {
1253	    $id=$self->findTVSeriesInfo($title);
1254	    if ( !defined($id) ) {
1255		$id=$self->findMovieInfo($title, $prog->{date}, 0); # close match
1256	    }
1257	}
1258	if ( defined($id) ) {
1259	    $self->{stats}->{$id->{matchLevel}."Matches"}++;
1260	    $self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++;
1261	    return($self->applyFound($prog, $id));
1262	}
1263	$self->status("failed to find a match for movie \"$title ($prog->{date})\"");
1264	return(undef);
1265	# fall through and try again as a tv series
1266    }
1267
1268    if ( !$movies_only ) {
1269	my $id=$self->findTVSeriesInfo($title);
1270	if ( defined($id) ) {
1271	    $self->{stats}->{$id->{matchLevel}."Matches"}++;
1272	    $self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++;
1273	    return($self->applyFound($prog, $id));
1274	}
1275
1276	if ( 0 ) {
1277	    # this has hard to support 'close' results, unless we know
1278	    # for certain we're looking for a movie (ie duration etc)
1279	    # this is a bad idea.
1280	    my $id=$self->findMovieInfo($title, undef, 2); # any title match
1281	    if ( defined($id) ) {
1282		$self->{stats}->{$id->{matchLevel}."Matches"}++;
1283		$self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++;
1284		return($self->applyFound($prog, $id));
1285	    }
1286	}
1287	$self->status("failed to find a match for show \"$title\"");
1288    }
1289    return(undef);
1290}
1291
1292#
1293# todo - add in stats on other things added (urls ?, actors, directors,categories)
1294#        separate out from what was added or updated
1295#
1296sub getStatsLines($)
1297{
1298    my $self=shift;
1299    my $totalChannelsParsed=shift;
1300
1301    my $endTime=time();
1302    my %stats=%{$self->{stats}};
1303
1304    my $ret=sprintf("Checked %d programs, on %d channels\n", $stats{programCount}, $totalChannelsParsed);
1305
1306    for my $cat (sort keys %{$self->{categories}}) {
1307	$ret.=sprintf("  found %d %s titles", $stats{perfect}->{$cat}+$stats{close}->{$cat},
1308		      $self->{categories}->{$cat});
1309	if ( $stats{close}->{$cat} != 0 ) {
1310	    if ( $stats{close}->{$cat} == 1 ) {
1311		$ret.=sprintf(" (%d was not perfect)", $stats{close}->{$cat});
1312	    }
1313	    else {
1314		$ret.=sprintf(" (%d were not perfect)", $stats{close}->{$cat});
1315	    }
1316	}
1317	$ret.="\n";
1318    }
1319
1320    $ret.=sprintf("  augmented %.2f%% of the programs, parsing %.2f programs/sec\n",
1321		  ($stats{programCount}!=0)?(($stats{perfectMatches}+$stats{closeMatches})*100)/$stats{programCount}:0,
1322		  ($endTime!=$stats{startTime} && $stats{programCount} != 0)?
1323		  $stats{programCount}/($endTime-$stats{startTime}):0);
1324
1325    return($ret);
1326}
1327
13281;
1329
1330# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1331# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1332
1333package XMLTV::IMDB::Crunch;
1334use LWP;
1335
1336use open ':encoding(iso-8859-1)';   # try to enforce file encoding (does this work in Perl <5.8.1? )
1337
1338# Use Term::ProgressBar if installed.
1339use constant Have_bar => eval {
1340    require Term::ProgressBar;
1341    $Term::ProgressBar::VERSION >= 2;
1342};
1343
1344#
1345# This package parses and manages to index imdb plain text files from
1346# ftp.imdb.com/interfaces. (see http://www.imdb.com/interfaces for
1347# details)
1348#
1349# I might, given time build a download manager that:
1350#    - downloads the latest plain text files
1351#    - understands how to download each week's diffs and apply them
1352# Currently, the 'downloadMissingFiles' flag in the hash of attributes
1353# passed triggers a simple-minded downloader.
1354#
1355# I may also roll this project into a xmltv-free imdb-specific
1356# perl interface that just supports callbacks and understands more of
1357# the imdb file formats.
1358#
1359
1360sub new
1361{
1362    my ($type) = shift;
1363    my $self={ @_ };            # remaining args become attributes
1364    for ($self->{downloadMissingFiles}) {
1365	$_=0 if not defined; # default
1366    }
1367
1368    for ('imdbDir', 'verbose') {
1369	die "invalid usage - no $_" if ( !defined($self->{$_}));
1370    }
1371
1372    $self->{stageLast} = 9;     # set the final stage in the build - i.e. the one which builds the final database
1373    $self->{stages} = { 1=>'movies', 2=>'directors', 3=>'actors', 4=>'actresses', 5=>'genres', 6=>'ratings', 7=>'keywords', 8=>'plot' };
1374    $self->{optionalStages} = { 'keywords' => 7, 'plot' => 8 };     # list of optional stages - no need to download files for these
1375
1376    $self->{moviedbIndex}="$self->{imdbDir}/moviedb.idx";
1377    $self->{moviedbData}="$self->{imdbDir}/moviedb.dat";
1378    $self->{moviedbInfo}="$self->{imdbDir}/moviedb.info";
1379    $self->{moviedbOffline}="$self->{imdbDir}/moviedb.offline";
1380
1381    # only leave progress bar on if its available
1382    if ( !Have_bar ) {
1383	$self->{showProgressBar}=0;
1384    }
1385
1386    bless($self, $type);
1387
1388    if ( $self->{stageToRun} ne $self->{stageLast} ) {
1389        # unless this is the last stage, check we have the necessary files
1390        return(undef)  if ( $self->checkFiles() != 0 );
1391    }
1392
1393    return($self);
1394}
1395
1396
1397sub checkFiles () {
1398
1399    my ($self)=@_;
1400
1401    if ( ! -d "$self->{imdbDir}" ) {
1402	if ( $self->{downloadMissingFiles} ) {
1403	    warn "creating directory $self->{imdbDir}\n";
1404	    mkdir $self->{imdbDir}, 0777
1405	      or die "cannot mkdir $self->{imdbDir}: $!";
1406	}
1407	else {
1408	    die "$self->{imdbDir}:does not exist";
1409	}
1410    }
1411    my $listsDir = "$self->{imdbDir}/lists";
1412    if ( ! -d $listsDir ) {
1413	mkdir $listsDir, 0777 or die "cannot mkdir $listsDir: $!";
1414    }
1415
1416  CHECK_FILES:
1417    my %missingListFiles; # maps 'movies' to filename ...movies.gz
1418
1419    FILES_CHECK:
1420    while ( my( $key, $value ) = each %{ $self->{stages} } ) {
1421        # don't check *all* files - only the ones we are crunching
1422        next FILES_CHECK  if ( lc($self->{stageToRun}) ne 'all' && $key != int($self->{stageToRun}) );
1423	my $file=$value;
1424	my $filename="$listsDir/$file.list";
1425	my $filenameGz="$filename.gz";
1426	my $filenameExists = -f $filename;
1427	my $filenameSize = -s $filename;
1428	my $filenameGzExists = -f $filenameGz;
1429	my $filenameGzSize = -s $filenameGz;
1430
1431	if ( $filenameExists and not $filenameSize ) {
1432	    warn "removing zero-length $filename\n";
1433	    unlink $filename or die "cannot unlink $filename: $!";
1434	    $filenameExists = 0;
1435	}
1436	if ( $filenameGzExists and not $filenameGzSize ) {
1437	    warn "removing zero-length $filenameGz\n";
1438	    unlink $filenameGz or die "cannot unlink $filenameGz: $!";
1439	    $filenameGzExists = 0;
1440	}
1441
1442	if ( not $filenameExists and not $filenameGzExists ) {
1443	    # Just report one of the filenames, keep the message simple.
1444	    warn "$filenameGz does not exist\n";
1445            if ( $self->{optionalStages}{$file} ) {
1446                warn "$file will not be added to database\n";
1447            } else {
1448                $missingListFiles{$file}=$filenameGz;
1449	}
1450	}
1451	elsif ( not $filenameExists and $filenameGzExists ) {
1452	    $self->{imdbListFiles}->{$file}=$filenameGz;
1453	}
1454	elsif ( $filenameExists and not $filenameGzExists ) {
1455	    $self->{imdbListFiles}->{$file}=$filename;
1456	}
1457	elsif ( $filenameExists and $filenameGzExists ) {
1458	    die "both $filename and $filenameGz exist, remove one of them\n";
1459	}
1460	else { die }
1461    }
1462    if ( $self->{downloadMissingFiles} ) {
1463	my $baseUrl = 'ftp://ftp.fu-berlin.de/pub/misc/movies/database';
1464	foreach ( sort keys %missingListFiles ) {
1465	    my $url = "$baseUrl/$_.list.gz";
1466	    my $filename = delete $missingListFiles{$_};
1467	    my $partial = "$filename.partial";
1468	    if (-e $partial) {
1469		if (not -s $partial) {
1470		    print STDERR "removing empty $partial\n";
1471		    unlink $partial or die "cannot unlink $partial: $!";
1472		}
1473		else {
1474		    die <<END
1475$partial already exists, remove it or try renaming to $filename and
1476resuming the download of <$url> by hand.
1477
1478END
1479  ;
1480		}
1481	    }
1482
1483	    print STDERR <<END
1484Trying to download <$url>.
1485With a slow network link this could fail; it might be better to
1486download the file by hand and save it as
1487$filename.
1488
1489END
1490  ;
1491	    # For downloading we use LWP
1492	    #
1493	    my $ua = LWP::UserAgent->new();
1494	    $ua->env_proxy();
1495	    $ua->show_progress(1);
1496
1497	    my $req = HTTP::Request->new(GET => $url);
1498	    $req->authorization_basic('anonymous', 'tv_imdb');
1499
1500	    my $resp = $ua->request($req, $filename);
1501	    my $got_size = -s $filename;
1502	    if (defined $resp and $resp->is_success ) {
1503		die if not $got_size;
1504		print STDERR "<$url>\n\t-> $filename, success\n\n";
1505	    }
1506	    else {
1507		my $msg = "failed to download $url to $filename";
1508		$msg .= ", http response code: ".$resp->status_line if defined $resp;
1509		warn $msg;
1510		if ($got_size) {
1511		    warn "renaming $filename -> $partial\n";
1512		    rename $filename, $partial
1513		      or die "cannot rename $filename to $partial: $!";
1514		    warn "You might try continuing the download of <$url> manually.\n";
1515		}
1516		exit(1);
1517	    }
1518	}
1519	$self->{downloadMissingFiles} = 0;
1520	goto CHECK_FILES;
1521    }
1522
1523    if ( %missingListFiles ) {
1524	print STDERR "tv_imdb: requires you to download the above files from ftp.imdb.com\n";
1525	print STDERR "         see http://www.imdb.com/interfaces for details\n";
1526        print STDERR "         or try the --download option\n";
1527	#return(undef);
1528        return 1;
1529    }
1530
1531    return 0;
1532}
1533
1534sub redirect($$)
1535{
1536    my ($self, $file)=@_;
1537
1538    if ( defined($file) ) {
1539	if ( !open($self->{logfd}, "> $file") ) {
1540	    print STDERR "$file:$!\n";
1541	    return(0);
1542	}
1543	$self->{errorCountInLog}=0;
1544    }
1545    else {
1546	close($self->{logfd});
1547	$self->{logfd}=undef;
1548    }
1549    return(1);
1550}
1551
1552sub error($$)
1553{
1554    my $self=shift;
1555    if ( defined($self->{logfd}) ) {
1556	print {$self->{logfd}} $_[0]."\n";
1557	$self->{errorCountInLog}++;
1558    }
1559    else {
1560	print STDERR $_[0]."\n";
1561    }
1562}
1563
1564sub status($$)
1565{
1566    my $self=shift;
1567
1568    if ( $self->{verbose} ) {
1569	print STDERR $_[0]."\n";
1570    }
1571}
1572
1573sub withThousands ($)
1574{
1575    my ($val) = @_;
1576    $val =~ s/(\d{1,3}?)(?=(\d{3})+$)/$1,/g;
1577    return $val;
1578}
1579
1580use XMLTV::Gunzip;
1581use IO::File;
1582
1583sub openMaybeGunzip($)
1584{
1585    for ( shift ) {
1586	return gunzip_open($_) if m/\.gz$/;
1587	return new IO::File("< $_");
1588    }
1589}
1590
1591sub closeMaybeGunzip($$)
1592{
1593    if ( $_[0]=~m/\.gz$/o ) {
1594	# Would close($fh) but that causes segfaults on my system.
1595	# Investigating, but in the meantime just leave it open.
1596	#
1597	#return gunzip_close($_[1]);
1598    }
1599
1600    # Apparently this can also segfault (wtf?).
1601    #return close($_[1]);
1602}
1603
1604sub readMoviesOrGenres($$$$)
1605{
1606    my ($self, $whichMoviesOrGenres, $countEstimate, $file)=@_;
1607    my $startTime=time();
1608    my $header;
1609    my $whatAreWeParsing;
1610    my $lineCount=0;
1611
1612    if ( $whichMoviesOrGenres eq "Movies" ) {
1613	$header="MOVIES LIST";
1614	$whatAreWeParsing=1;
1615    }
1616    elsif ( $whichMoviesOrGenres eq "Genres" ) {
1617	$header="8: THE GENRES LIST";
1618	$whatAreWeParsing=2;
1619    }
1620    my $fh = openMaybeGunzip($file) || return(-2);
1621    while(<$fh>) {
1622	$lineCount++;
1623	if ( m/^$header/ ) {
1624	    if ( !($_=<$fh>) || !m/^===========/o ) {
1625		$self->error("missing ======= after $header at line $lineCount");
1626		closeMaybeGunzip($file, $fh);
1627		return(-1);
1628	    }
1629	    if ( !($_=<$fh>) || !m/^\s*$/o ) {
1630		$self->error("missing empty line after ======= at line $lineCount");
1631		closeMaybeGunzip($file, $fh);
1632		return(-1);
1633	    }
1634	    last;
1635	}
1636	elsif ( $lineCount > 1000 ) {
1637	    $self->error("$file: stopping at line $lineCount, didn't see \"$header\" line");
1638	    closeMaybeGunzip($file, $fh);
1639	    return(-1);
1640	}
1641    }
1642
1643    my $progress=Term::ProgressBar->new({name  => "parsing $whichMoviesOrGenres",
1644					 count => $countEstimate,
1645					 ETA   => 'linear'})
1646	if ( $self->{showProgressBar} );
1647
1648    $progress->minor(0) if ($self->{showProgressBar});
1649    $progress->max_update_rate(1) if ($self->{showProgressBar});
1650    my $next_update=0;
1651
1652    my $count=0;
1653    while(<$fh>) {
1654	$lineCount++;
1655	my $line=$_;
1656	#print "read line $lineCount:$line\n";
1657
1658	# end is line consisting of only '-'
1659	last if ( $line=~m/^\-\-\-\-\-\-\-+/o );
1660
1661	$line=~s/\n$//o;
1662
1663	my $tab=index($line, "\t");
1664	if ( $tab != -1 ) {
1665	    my $mkey=substr($line, 0, $tab);
1666
1667	    next if ($mkey=~m/\s*\{\{SUSPENDED\}\}/o);
1668
1669	    if ( $whatAreWeParsing == 2 ) {
1670		# don't see what these are...?
1671		# ignore {{SUSPENDED}}
1672		$mkey=~s/\s*\{\{SUSPENDED\}\}//o;
1673
1674		# ignore {Twelve Angry Men (1954)}
1675		$mkey=~s/\s*\{[^\}]+\}//go;
1676
1677		# skip enties that have {} in them since they're tv episodes
1678		#next if ( $mkey=~s/\s*\{[^\}]+\}$//o );
1679
1680		my $genre=substr($line, $tab);
1681
1682		# genres sometimes has more than one tab
1683		$genre=~s/^\t+//og;
1684		if ( defined($self->{movies}{$mkey}) ) {
1685		    $self->{movies}{$mkey}.="|".$genre;
1686		}
1687		else {
1688		    $self->{movies}{$mkey}=$genre;
1689		    # returned count is number of unique titles found
1690		    $count++;
1691		}
1692	    }
1693	    else {
1694		push(@{$self->{movies}}, $mkey);
1695		# returned count is number of titles found
1696		$count++;
1697	    }
1698
1699	    if ( $self->{showProgressBar} ) {
1700		# re-adjust target so progress bar doesn't seem too wonky
1701		if ( $count > $countEstimate ) {
1702		    $countEstimate = $progress->target($count+1000);
1703		    $next_update=$progress->update($count);
1704		}
1705		elsif ( $count > $next_update ) {
1706		    $next_update=$progress->update($count);
1707		}
1708	    }
1709	}
1710	else {
1711	    $self->error("$file:$lineCount: unrecognized format (missing tab)");
1712	    $next_update=$progress->update($count) if ($self->{showProgressBar});
1713	}
1714    }
1715    $progress->update($countEstimate) if ($self->{showProgressBar});
1716
1717    $self->status(sprintf("parsing $whichMoviesOrGenres found ".withThousands($count)." titles in ".
1718			  withThousands($lineCount)." lines in %d seconds",time()-$startTime));
1719
1720    closeMaybeGunzip($file, $fh);
1721    return($count);
1722}
1723
1724sub readCastOrDirectors($$$)
1725{
1726    my ($self, $whichCastOrDirector, $castCountEstimate, $file)=@_;
1727    my $startTime=time();
1728
1729    my $header;
1730    my $whatAreWeParsing;
1731    my $lineCount=0;
1732
1733    if ( $whichCastOrDirector eq "Actors" ) {
1734	$header="THE ACTORS LIST";
1735	$whatAreWeParsing=1;
1736    }
1737    elsif ( $whichCastOrDirector eq "Actresses" ) {
1738	$header="THE ACTRESSES LIST";
1739	$whatAreWeParsing=2;
1740    }
1741    elsif ( $whichCastOrDirector eq "Directors" ) {
1742	$header="THE DIRECTORS LIST";
1743	$whatAreWeParsing=3;
1744    }
1745    else {
1746	die "why are we here ?";
1747    }
1748
1749    my $fh = openMaybeGunzip($file) || return(-2);
1750    my $progress=Term::ProgressBar->new({name  => "parsing $whichCastOrDirector",
1751					 count => $castCountEstimate,
1752					 ETA   => 'linear'})
1753      if ($self->{showProgressBar});
1754    $progress->minor(0) if ($self->{showProgressBar});
1755    $progress->max_update_rate(1) if ($self->{showProgressBar});
1756    my $next_update=0;
1757    while(<$fh>) {
1758	$lineCount++;
1759	if ( m/^$header/ ) {
1760	    if ( !($_=<$fh>) || !m/^===========/o ) {
1761		$self->error("missing ======= after $header at line $lineCount");
1762		closeMaybeGunzip($file, $fh);
1763		return(-1);
1764	    }
1765	    if ( !($_=<$fh>) || !m/^\s*$/o ) {
1766		$self->error("missing empty line after ======= at line $lineCount");
1767		closeMaybeGunzip($file, $fh);
1768		return(-1);
1769	    }
1770	    if ( !($_=<$fh>) || !m/^Name\s+Titles\s*$/o ) {
1771		$self->error("missing name/titles line after ======= at line $lineCount");
1772		closeMaybeGunzip($file, $fh);
1773		return(-1);
1774	    }
1775	    if ( !($_=<$fh>) || !m/^[\s\-]+$/o ) {
1776		$self->error("missing name/titles suffix line after ======= at line $lineCount");
1777		closeMaybeGunzip($file, $fh);
1778		return(-1);
1779	    }
1780	    last;
1781	}
1782	elsif ( $lineCount > 1000 ) {
1783	    $self->error("$file: stopping at line $lineCount, didn't see \"$header\" line");
1784	    closeMaybeGunzip($file, $fh);
1785	    return(-1);
1786	}
1787    }
1788
1789    my $cur_name;
1790    my $count=0;
1791    my $castNames=0;
1792    while(<$fh>) {
1793	$lineCount++;
1794	my $line=$_;
1795	$line=~s/\n$//o;
1796	#$self->status("read line $lineCount:$line");
1797
1798	# end is line consisting of only '-'
1799	last if ( $line=~m/^\-\-\-\-\-\-\-+/o );
1800
1801	next if ( length($line) == 0 );
1802
1803	if ( $line=~s/^([^\t]+)\t+//o ) {
1804	    $cur_name=$1;
1805	    $castNames++;
1806
1807	    if ( $self->{showProgressBar} ) {
1808		# re-adjust target so progress bar doesn't seem too wonky
1809		if ( $castNames > $castCountEstimate ) {
1810		    $castCountEstimate = $progress->target($castNames+100);
1811		    $next_update=$progress->update($castNames);
1812		}
1813		elsif ( $castNames > $next_update ) {
1814		    $next_update=$progress->update($castNames);
1815		}
1816	    }
1817	}
1818
1819	my $billing;
1820	my $HostNarrator="";
1821	if ( $whatAreWeParsing < 3 ) {
1822	    # actors or actresses
1823	    $billing="9999";
1824	    if ( $line=~s/\s*<(\d+)>//o ) {
1825		$billing=sprintf("%04d", int($1));
1826	    }
1827
1828	    if ( (my $start=index($line, " [")) != -1 ) {
1829		#my $end=rindex($line, "]");
1830		my $ex=substr($line, $start+1);
1831
1832		if ( $ex=~s/Host//o ) {
1833		    if ( length($HostNarrator) ) {
1834			$HostNarrator.=",";
1835		    }
1836		    $HostNarrator.="Host";
1837		}
1838		if ( $ex=~s/Narrator//o ) {
1839		    if ( length($HostNarrator) ) {
1840			$HostNarrator.=",";
1841		    }
1842		    $HostNarrator.="Narrator";
1843		}
1844		$line=substr($line, 0, $start);
1845		# ignore character name
1846	    }
1847	}
1848	# try ignoring these
1849	next if ($line=~m/\s*\{\{SUSPENDED\}\}/o);
1850
1851	# don't see what these are...?
1852	# ignore {{SUSPENDED}}
1853	$line=~s/\s*\{\{SUSPENDED\}\}//o;
1854
1855  # [honir] this is wrong - this puts cast from all the episodes as though they are in the entire series!
1856	# ##ignore {Twelve Angry Men (1954)}
1857	$line=~s/\s*\{[^\}]+\}//o;
1858
1859	if ( $whatAreWeParsing < 3 ) {
1860	    if ( $line=~s/\s*\(aka ([^\)]+)\).*$//o ) {
1861		# $attr=$1;
1862	    }
1863	}
1864	if ( $line=~s/  (\(.*)$//o ) {
1865	    # $attrs=$1;
1866	}
1867	$line=~s/^\s+//og;
1868	$line=~s/\s+$//og;
1869
1870	if ( $whatAreWeParsing < 3 ) {
1871	    if ( $line=~s/\s+Narrator$//o ) {
1872		# ignore
1873	    }
1874	}
1875
1876	my $val=$self->{movies}{$line};
1877	my $name=$cur_name;
1878	if ( length($HostNarrator) ) {
1879	    $name.="[$HostNarrator]";
1880	}
1881	if ( defined($billing) ) {
1882	    if ( defined($val) ) {
1883		$self->{movies}{$line}=$val."|$billing:$name";
1884	    }
1885	    else {
1886		$self->{movies}{$line}="$billing:$name";
1887	    }
1888	}
1889	else {
1890	    if ( defined($val) ) {
1891		$self->{movies}{$line}=$val."|$name";
1892	    }
1893	    else {
1894		$self->{movies}{$line}=$name;
1895	    }
1896	}
1897	$count++;
1898    }
1899    $progress->update($castCountEstimate) if ($self->{showProgressBar});
1900
1901    $self->status(sprintf("parsing $whichCastOrDirector found ".withThousands($castNames)." names, ".
1902			  withThousands($count)." titles in ".withThousands($lineCount)." lines in %d seconds",time()-$startTime));
1903
1904    closeMaybeGunzip($file, $fh);
1905
1906    return($castNames);
1907}
1908
1909sub readRatings($$$$)
1910{
1911    my ($self, $countEstimate, $file)=@_;
1912    my $startTime=time();
1913    my $lineCount=0;
1914
1915    my $fh = openMaybeGunzip($file) || return(-2);
1916    while(<$fh>) {
1917	$lineCount++;
1918	if ( m/^MOVIE RATINGS REPORT/o ) {
1919	    if ( !($_=<$fh>) || !m/^\s*$/o) {
1920		$self->error("missing empty line after \"MOVIE RATINGS REPORT\" at line $lineCount");
1921		closeMaybeGunzip($file, $fh);
1922		return(-1);
1923	    }
1924	    if ( !($_=<$fh>) || !m/^New  Distribution  Votes  Rank  Title/o ) {
1925		$self->error("missing \"New  Distribution  Votes  Rank  Title\" at line $lineCount");
1926		closeMaybeGunzip($file, $fh);
1927		return(-1);
1928	    }
1929	    last;
1930	}
1931	elsif ( $lineCount > 1000 ) {
1932	    $self->error("$file: stopping at line $lineCount, didn't see \"MOVIE RATINGS REPORT\" line");
1933	    closeMaybeGunzip($file, $fh);
1934	    return(-1);
1935	}
1936    }
1937
1938    my $progress=Term::ProgressBar->new({name  => "parsing Ratings",
1939					 count => $countEstimate,
1940					 ETA   => 'linear'})
1941      if ($self->{showProgressBar});
1942
1943    $progress->minor(0) if ($self->{showProgressBar});
1944    $progress->max_update_rate(1) if ($self->{showProgressBar});
1945    my $next_update=0;
1946
1947    my $count=0;
1948    while(<$fh>) {
1949	$lineCount++;
1950	my $line=$_;
1951	#print "read line $lineCount:$line";
1952
1953	$line=~s/\n$//o;
1954
1955	# skip empty lines (only really appear right before last line ending with ----
1956	next if ( $line=~m/^\s*$/o );
1957	# end is line consisting of only '-'
1958	last if ( $line=~m/^\-\-\-\-\-\-\-+/o );
1959
1960        # e.g. New  Distribution  Votes  Rank  Title
1961        #            0000000133  225568   8.9  12 Angry Men (1957)
1962	if ( $line=~s/^\s+([\.|\*|\d]+)\s+(\d+)\s+(\d+)\.(\d+)\s+//o ) {
1963	    $self->{movies}{$line}=[$1,$2,"$3.$4"];
1964	    $count++;
1965	    if ( $self->{showProgressBar} ) {
1966		# re-adjust target so progress bar doesn't seem too wonky
1967		if ( $count > $countEstimate ) {
1968		    $countEstimate = $progress->target($count+1000);
1969		    $next_update=$progress->update($count);
1970		}
1971		elsif ( $count > $next_update ) {
1972		    $next_update=$progress->update($count);
1973		}
1974	    }
1975	}
1976	else {
1977	    $self->error("$file:$lineCount: unrecognized format");
1978	    $next_update=$progress->update($count) if ($self->{showProgressBar});
1979	}
1980    }
1981    $progress->update($countEstimate) if ($self->{showProgressBar});
1982
1983    $self->status(sprintf("parsing Ratings found ".withThousands($count)." titles in ".
1984			  withThousands($lineCount)." lines in %d seconds",time()-$startTime));
1985
1986    closeMaybeGunzip($file, $fh);
1987    return($count);
1988}
1989
1990sub readKeywords($$$$)
1991{
1992    my ($self, $countEstimate, $file)=@_;
1993    my $startTime=time();
1994    my $lineCount=0;
1995
1996    my $fh = openMaybeGunzip($file) || return(-2);
1997    while(<$fh>) {
1998	$lineCount++;
1999
2000	if ( m/THE KEYWORDS LIST/ ) {
2001	    if ( !($_=<$fh>) || !m/^===========/o ) {
2002		$self->error("missing ======= after \"THE KEYWORDS LIST\" at line $lineCount");
2003		closeMaybeGunzip($file, $fh);
2004		return(-1);
2005	    }
2006	    if ( !($_=<$fh>) || !m/^\s*$/o ) {
2007		$self->error("missing empty line after ======= at line $lineCount");
2008		closeMaybeGunzip($file, $fh);
2009		return(-1);
2010	    }
2011	    last;
2012	}
2013	elsif ( $lineCount > 100000 ) {
2014	    $self->error("$file: stopping at line $lineCount, didn't see \"THE KEYWORDS LIST\" line");
2015	    closeMaybeGunzip($file, $fh);
2016	    return(-1);
2017	}
2018    }
2019
2020    my $progress=Term::ProgressBar->new({name  => "parsing keywords",
2021					 count => $countEstimate,
2022					 ETA   => 'linear'})
2023      if ($self->{showProgressBar});
2024
2025    $progress->minor(0) if ($self->{showProgressBar});
2026    $progress->max_update_rate(1) if ($self->{showProgressBar});
2027    my $next_update=0;
2028
2029    my $count=0;
2030    while(<$fh>) {
2031	$lineCount++;
2032	my $line=$_;
2033	chomp($line);
2034	next if ($line =~ m/^\s*$/);
2035	my ($title, $keyword) = ($line =~ m/^(.*)\s+(\S+)\s*$/);
2036	if ( defined($title) and defined($keyword) ) {
2037
2038            my ($episode) = $title =~ m/^.*\s+(\{.*\})$/;
2039
2040            # ignore anything which is an episode (e.g. "{Doctor Who (#10.22)}" )
2041            if ( !defined $episode || $episode eq '' )
2042            {
2043                if ( defined($self->{movies}{$title}) ) {
2044                    $self->{movies}{$title}.=",".$keyword;
2045                } else {
2046                    $self->{movies}{$title}=$keyword;
2047                    # returned count is number of unique titles found
2048                    $count++;
2049                }
2050            }
2051
2052            if ( $self->{showProgressBar} ) {
2053                # re-adjust target so progress bar doesn't seem too wonky
2054    	        if ( $count > $countEstimate ) {
2055    	    	    $countEstimate = $progress->target($count+1000);
2056                    $next_update=$progress->update($count);
2057    	        }
2058    	        elsif ( $count > $next_update ) {
2059    	    	    $next_update=$progress->update($count);
2060    	        }
2061    	    }
2062        } else {
2063	    $self->error("$file:$lineCount: unrecognized format \"$line\"");
2064	    $next_update=$progress->update($count) if ($self->{showProgressBar});
2065	}
2066    }
2067    $progress->update($countEstimate) if ($self->{showProgressBar});
2068
2069    $self->status(sprintf("parsing Keywords found ".withThousands($count)." titles in ".
2070			  withThousands($lineCount)." lines in %d seconds",time()-$startTime));
2071
2072    closeMaybeGunzip($file, $fh);
2073    return($count);
2074}
2075
2076sub readPlots($$$$)
2077{
2078    my ($self, $countEstimate, $file)=@_;
2079    my $startTime=time();
2080    my $lineCount=0;
2081
2082    my $fh = openMaybeGunzip($file) || return(-2);
2083    while(<$fh>) {
2084	$lineCount++;
2085
2086	if ( m/PLOT SUMMARIES LIST/ ) {
2087	    if ( !($_=<$fh>) || !m/^===========/o ) {
2088		$self->error("missing ======= after \"PLOT SUMMARIES LIST\" at line $lineCount");
2089		closeMaybeGunzip($file, $fh);
2090		return(-1);
2091	    }
2092	    if ( !($_=<$fh>) || !m/^-----------/o ) {
2093		$self->error("missing ------- line after ======= at line $lineCount");
2094		closeMaybeGunzip($file, $fh);
2095		return(-1);
2096	    }
2097	    last;
2098	}
2099	elsif ( $lineCount > 500 ) {
2100	    $self->error("$file: stopping at line $lineCount, didn't see \"PLOT SUMMARIES LIST\" line");
2101	    closeMaybeGunzip($file, $fh);
2102	    return(-1);
2103	}
2104    }
2105
2106    my $progress=Term::ProgressBar->new({name  => "parsing plots",
2107					 count => $countEstimate,
2108					 ETA   => 'linear'})
2109      if ($self->{showProgressBar});
2110
2111    $progress->minor(0) if ($self->{showProgressBar});
2112    $progress->max_update_rate(1) if ($self->{showProgressBar});
2113    my $next_update=0;
2114
2115    my $count=0;
2116    while(<$fh>) {
2117	$lineCount++;
2118	my $line=$_;
2119	chomp($line);
2120	next if ($line =~ m/^\s*$/);
2121	my ($title, $episode) = ($line =~ m/^MV:\s(.*?)\s?(\{.*\})?$/);
2122	if ( defined($title) ) {
2123
2124            # ignore anything which is an episode (e.g. "{Doctor Who (#10.22)}" )
2125            if ( !defined $episode || $episode eq '' )
2126            {
2127                my $plot = '';
2128                LOOP:
2129                while (1) {
2130                    if ( $line = <$fh> ) {
2131                        $lineCount++;
2132                        chomp($line);
2133                        next if ($line =~ m/^\s*$/);
2134                        if ( $line =~ m/PL:\s(.*)$/ ) {     # plot summary is a number of lines starting "PL:"
2135                            $plot .= ($plot ne ''?' ':'') . $1;
2136                        }
2137                        last LOOP if ( $line =~ m/BY:\s(.*)$/ );     # the author line "BY:" signals the end of the plot summary
2138                    } else {
2139                        last LOOP;
2140                    }
2141                }
2142
2143                if ( !defined($self->{movies}{$title}) ) {
2144                    # ensure there's no tab chars in the plot or else the db stage will barf
2145                    $plot =~ s/\t//og;
2146                    $self->{movies}{$title}=$plot;
2147                    # returned count is number of unique titles found
2148                    $count++;
2149                }
2150            }
2151
2152            if ( $self->{showProgressBar} ) {
2153                # re-adjust target so progress bar doesn't seem too wonky
2154    	        if ( $count > $countEstimate ) {
2155    	    	    $countEstimate = $progress->target($count+1000);
2156                    $next_update=$progress->update($count);
2157    	        }
2158    	        elsif ( $count > $next_update ) {
2159    	    	    $next_update=$progress->update($count);
2160    	        }
2161    	    }
2162        } else {
2163            # skip lines up to the next "MV:"
2164            if ($line !~ m/^(---|PL:|BY:)/ ) {
2165                $self->error("$file:$lineCount: unrecognized format \"$line\"");
2166            }
2167	    $next_update=$progress->update($count) if ($self->{showProgressBar});
2168	}
2169    }
2170    $progress->update($countEstimate) if ($self->{showProgressBar});
2171
2172    $self->status(sprintf("parsing Plots found $count ".withThousands($count)." in ".
2173			  withThousands($lineCount)." lines in %d seconds",time()-$startTime));
2174
2175    closeMaybeGunzip($file, $fh);
2176    return($count);
2177}
2178
2179sub stageComplete($)
2180{
2181    my ($self, $stage)=@_;
2182
2183    if ( -f "$self->{imdbDir}/stage$stage.data" ) {
2184	return(1);
2185    }
2186    return(0);
2187}
2188
2189sub dbinfoLoad($)
2190{
2191    my $self=shift;
2192
2193    my $ret=XMLTV::IMDB::loadDBInfo($self->{moviedbInfo});
2194    if ( ref $ret eq 'SCALAR' ) {
2195	return($ret);
2196    }
2197    $self->{dbinfo}=$ret;
2198    return(undef);
2199}
2200
2201sub dbinfoAdd($$$)
2202{
2203    my ($self, $key, $value)=@_;
2204    $self->{dbinfo}->{$key}=$value;
2205}
2206
2207sub dbinfoGet($$$)
2208{
2209    my ($self, $key, $defaultValue)=@_;
2210    if ( defined($self->{dbinfo}->{$key}) ) {
2211	return($self->{dbinfo}->{$key});
2212    }
2213    return($defaultValue);
2214}
2215
2216sub dbinfoSave($)
2217{
2218    my $self=shift;
2219    open(INFO, "> $self->{moviedbInfo}") || return(1);
2220    for (sort keys %{$self->{dbinfo}}) {
2221	print INFO "".$_.":".$self->{dbinfo}->{$_}."\n";
2222    }
2223    close(INFO);
2224    return(0);
2225}
2226
2227sub dbinfoGetFileSize($$)
2228{
2229    my ($self, $key)=@_;
2230
2231    if ( !defined($self->{imdbListFiles}->{$key}) ) {
2232	die ("invalid call");
2233    }
2234    my $fileSize=int(-s "$self->{imdbListFiles}->{$key}");
2235
2236    # if compressed, then attempt to run gzip -l
2237    if ( $self->{imdbListFiles}->{$key}=~m/.gz$/) {
2238	if ( open(my $fd, "gzip -l ".$self->{imdbListFiles}->{$key}."|") ) {
2239	    # if parse fails, then defalt to wild ass guess of compression of 65%
2240	    $fileSize=int(($fileSize*100)/(100-65));
2241
2242	    while(<$fd>) {
2243		if ( m/^\s*\d+\s+(\d+)/ ) {
2244		    $fileSize=$1;
2245		}
2246	    }
2247	    close($fd);
2248	}
2249	else {
2250	    # wild ass guess of compression of 65%
2251	    $fileSize=int(($fileSize*100)/(100-65));
2252	}
2253    }
2254    return($fileSize);
2255}
2256
2257sub dbinfoCalcEstimate($$$)
2258{
2259    my ($self, $key, $estimateSizePerEntry)=@_;
2260
2261    my $fileSize=$self->dbinfoGetFileSize($key);
2262
2263    my $countEstimate=int($fileSize/$estimateSizePerEntry);
2264
2265    $self->dbinfoAdd($key."_list_file", $self->{imdbListFiles}->{$key});
2266    $self->dbinfoAdd($key."_list_file_size", int(-s "$self->{imdbListFiles}->{$key}"));
2267    $self->dbinfoAdd($key."_list_file_size_uncompressed", $fileSize);
2268    $self->dbinfoAdd($key."_list_count_estimate", $countEstimate);
2269    return($countEstimate);
2270}
2271
2272sub dbinfoCalcBytesPerEntry($$$)
2273{
2274    my ($self, $key, $calcActualForThisNumber)=@_;
2275
2276    my $fileSize=$self->dbinfoGetFileSize($key);
2277
2278    return(int($fileSize/$calcActualForThisNumber));
2279}
2280
2281sub invokeStage($$)
2282{
2283    my ($self, $stage)=@_;
2284
2285    my $startTime=time();
2286    if ( $stage == 1 ) {
2287	$self->status("parsing Movies list for stage $stage..");
2288	my $countEstimate=$self->dbinfoCalcEstimate("movies", 47);
2289
2290	my $num=$self->readMoviesOrGenres("Movies", $countEstimate, "$self->{imdbListFiles}->{movies}");
2291	if ( $num < 0 ) {
2292	    if ( $num == -2 ) {
2293		$self->error("you need to download $self->{imdbListFiles}->{movies} from ftp.imdb.com");
2294	    }
2295	    return(1);
2296	}
2297	elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) {
2298	    my $better=$self->dbinfoCalcBytesPerEntry("movies", $num);
2299	    $self->status("ARG estimate of $countEstimate for movies needs updating, found $num ($better bytes/entry)");
2300	}
2301	$self->dbinfoAdd("db_stat_movie_count", "$num");
2302
2303	$self->status("writing stage1 data ..");
2304	{
2305	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2306	    my $progress=Term::ProgressBar->new({name  => "writing titles",
2307						 count => $countEstimate,
2308						 ETA   => 'linear'})
2309	      if ($self->{showProgressBar});
2310	    $progress->minor(0) if ($self->{showProgressBar});
2311	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2312	    my $next_update=0;
2313
2314	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2315	    my $count=0;
2316	    for my $movie (@{$self->{movies}}) {
2317		print OUT "$movie\n";
2318
2319		$count++;
2320		if ($self->{showProgressBar}) {
2321		    # re-adjust target so progress bar doesn't seem too wonky
2322		    if ( $count > $countEstimate ) {
2323			$countEstimate = $progress->target($count+100);
2324			$next_update=$progress->update($count);
2325		    }
2326		    elsif ( $count > $next_update ) {
2327			$next_update=$progress->update($count);
2328		    }
2329		}
2330	    }
2331	    $progress->update($countEstimate) if ($self->{showProgressBar});
2332	    close(OUT);
2333	    delete($self->{movies});
2334	}
2335    }
2336    elsif ( $stage == 2 ) {
2337	$self->status("parsing Directors list for stage $stage..");
2338
2339	my $countEstimate=$self->dbinfoCalcEstimate("directors", 258);
2340
2341	my $num=$self->readCastOrDirectors("Directors", $countEstimate, "$self->{imdbListFiles}->{directors}");
2342	if ( $num < 0 ) {
2343	    if ( $num == -2 ) {
2344		$self->error("you need to download $self->{imdbListFiles}->{directors} from ftp.imdb.com (see http://www.imdb.com/interfaces)");
2345	    }
2346	    return(1);
2347	}
2348	elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) {
2349	    my $better=$self->dbinfoCalcBytesPerEntry("directors", $num);
2350	    $self->status("ARG estimate of $countEstimate for directors needs updating, found $num ($better bytes/entry)");
2351	}
2352	$self->dbinfoAdd("db_stat_director_count", "$num");
2353
2354	$self->status("writing stage2 data ..");
2355	{
2356	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2357	    my $progress=Term::ProgressBar->new({name  => "writing directors",
2358						 count => $countEstimate,
2359						 ETA   => 'linear'})
2360	      if ($self->{showProgressBar});
2361	    $progress->minor(0) if ($self->{showProgressBar});
2362	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2363	    my $next_update=0;
2364
2365	    my $count=0;
2366	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2367	    for my $key (keys %{$self->{movies}}) {
2368		my %dir;
2369		for (split('\|', $self->{movies}{$key})) {
2370		    $dir{$_}++;
2371		}
2372		my @list;
2373		for (keys %dir) {
2374		    push(@list, sprintf("%03d:%s", $dir{$_}, $_));
2375		}
2376		my $value="";
2377		for my $c (reverse sort {$a cmp $b} @list) {
2378		    my ($num, $name)=split(':', $c);
2379		    $value.=$name."|";
2380		}
2381		$value=~s/\|$//o;
2382		print OUT "$key\t$value\n";
2383
2384		$count++;
2385		if ($self->{showProgressBar}) {
2386		    # re-adjust target so progress bar doesn't seem too wonky
2387		    if ( $count > $countEstimate ) {
2388			$countEstimate = $progress->target($count+100);
2389			$next_update=$progress->update($count);
2390		    }
2391		    elsif ( $count > $next_update ) {
2392			$next_update=$progress->update($count);
2393		    }
2394		}
2395	    }
2396	    $progress->update($countEstimate) if ($self->{showProgressBar});
2397	    close(OUT);
2398	    delete($self->{movies});
2399	}
2400	#unlink("$self->{imdbDir}/stage1.data");
2401    }
2402    elsif ( $stage == 3 ) {
2403	$self->status("parsing Actors list for stage $stage..");
2404
2405	#print "re-reading movies into memory for reverse lookup..\n";
2406	my $countEstimate=$self->dbinfoCalcEstimate("actors", 449);
2407
2408	my $num=$self->readCastOrDirectors("Actors", $countEstimate, "$self->{imdbListFiles}->{actors}");
2409	if ( $num < 0 ) {
2410	    if ( $num == -2 ) {
2411		$self->error("you need to download $self->{imdbListFiles}->{actors} from ftp.imdb.com (see http://www.imdb.com/interfaces)");
2412	    }
2413	    return(1);
2414	}
2415	elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) {
2416	    my $better=$self->dbinfoCalcBytesPerEntry("actors", $num);
2417	    $self->status("ARG estimate of $countEstimate for actors needs updating, found $num ($better bytes/entry)");
2418	}
2419	$self->dbinfoAdd("db_stat_actor_count", "$num");
2420
2421	$self->status("writing stage3 data ..");
2422	{
2423	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2424	    my $progress=Term::ProgressBar->new({name  => "writing actors",
2425						 count => $countEstimate,
2426						 ETA   => 'linear'})
2427	      if ($self->{showProgressBar});
2428	    $progress->minor(0) if ($self->{showProgressBar});
2429	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2430	    my $next_update=0;
2431
2432	    my $count=0;
2433	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2434	    for my $key (keys %{$self->{movies}}) {
2435		print OUT "$key\t$self->{movies}{$key}\n";
2436
2437		$count++;
2438		if ($self->{showProgressBar}) {
2439		    # re-adjust target so progress bar doesn't seem too wonky
2440		    if ( $count > $countEstimate ) {
2441			$countEstimate = $progress->target($count+100);
2442			$next_update=$progress->update($count);
2443		    }
2444		    elsif ( $count > $next_update ) {
2445			$next_update=$progress->update($count);
2446		    }
2447		}
2448	    }
2449	    $progress->update($countEstimate) if ($self->{showProgressBar});
2450	    close(OUT);
2451	    delete($self->{movies});
2452	}
2453    }
2454    elsif ( $stage == 4 ) {
2455	$self->status("parsing Actresses list for stage $stage..");
2456
2457	my $countEstimate=$self->dbinfoCalcEstimate("actresses", 483);
2458	my $num=$self->readCastOrDirectors("Actresses", $countEstimate, "$self->{imdbListFiles}->{actresses}");
2459	if ( $num < 0 ) {
2460	    if ( $num == -2 ) {
2461		$self->error("you need to download $self->{imdbListFiles}->{actresses} from ftp.imdb.com (see http://www.imdb.com/interfaces)");
2462	    }
2463	    return(1);
2464	}
2465	elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) {
2466	    my $better=$self->dbinfoCalcBytesPerEntry("actresses", $num);
2467	    $self->status("ARG estimate of $countEstimate for actresses needs updating, found $num ($better bytes/entry)");
2468	}
2469	$self->dbinfoAdd("db_stat_actress_count", "$num");
2470
2471	$self->status("writing stage4 data ..");
2472	{
2473	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2474	    my $progress=Term::ProgressBar->new({name  => "writing actresses",
2475						 count => $countEstimate,
2476						 ETA   => 'linear'})
2477	      if ($self->{showProgressBar});
2478	    $progress->minor(0) if ($self->{showProgressBar});
2479	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2480	    my $next_update=0;
2481
2482	    my $count=0;
2483	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2484	    for my $key (keys %{$self->{movies}}) {
2485		print OUT "$key\t$self->{movies}{$key}\n";
2486		$count++;
2487		if ($self->{showProgressBar}) {
2488		    # re-adjust target so progress bar doesn't seem too wonky
2489		    if ( $count > $countEstimate ) {
2490			$countEstimate = $progress->target($count+100);
2491			$next_update=$progress->update($count);
2492		    }
2493		    elsif ( $count > $next_update ) {
2494			$next_update=$progress->update($count);
2495		    }
2496		}
2497	    }
2498	    $progress->update($countEstimate) if ($self->{showProgressBar});
2499	    close(OUT);
2500	    delete($self->{movies});
2501	}
2502	#unlink("$self->{imdbDir}/stage3.data");
2503    }
2504    elsif ( $stage == 5 ) {
2505	$self->status("parsing Genres list for stage $stage..");
2506	my $countEstimate=$self->dbinfoCalcEstimate("genres", 68);
2507
2508	my $num=$self->readMoviesOrGenres("Genres", $countEstimate, "$self->{imdbListFiles}->{genres}");
2509	if ( $num < 0 ) {
2510	    if ( $num == -2 ) {
2511		$self->error("you need to download $self->{imdbListFiles}->{genres} from ftp.imdb.com");
2512	    }
2513	    return(1);
2514	}
2515	elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) {
2516	    my $better=$self->dbinfoCalcBytesPerEntry("genres", $num);
2517	    $self->status("ARG estimate of $countEstimate for genres needs updating, found $num ($better bytes/entry)");
2518	}
2519	$self->dbinfoAdd("db_stat_genres_count", "$num");
2520
2521	$self->status("writing stage5 data ..");
2522	{
2523	    my $countEstimate=$self->dbinfoGet("db_stat_genres_count", 0);
2524	    my $progress=Term::ProgressBar->new({name  => "writing genres",
2525						 count => $countEstimate,
2526						 ETA   => 'linear'})
2527	      if ($self->{showProgressBar});
2528	    $progress->minor(0) if ($self->{showProgressBar});
2529	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2530	    my $next_update=0;
2531
2532	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2533	    my $count=0;
2534	    for my $movie (keys %{$self->{movies}}) {
2535		print OUT "$movie\t$self->{movies}->{$movie}\n";
2536
2537		$count++;
2538		if ($self->{showProgressBar}) {
2539		    # re-adjust target so progress bar doesn't seem too wonky
2540		    if ( $count > $countEstimate ) {
2541			$countEstimate = $progress->target($count+100);
2542			$next_update=$progress->update($count);
2543		    }
2544		    elsif ( $count > $next_update ) {
2545			$next_update=$progress->update($count);
2546		    }
2547		}
2548	    }
2549	    $progress->update($countEstimate) if ($self->{showProgressBar});
2550	    close(OUT);
2551	    delete($self->{movies});
2552	}
2553    }
2554    elsif ( $stage == 6 ) {
2555	$self->status("parsing Ratings list for stage $stage..");
2556	my $countEstimate=$self->dbinfoCalcEstimate("ratings", 68);
2557
2558	my $num=$self->readRatings($countEstimate, "$self->{imdbListFiles}->{ratings}");
2559	if ( $num < 0 ) {
2560	    if ( $num == -2 ) {
2561		$self->error("you need to download $self->{imdbListFiles}->{ratings} from ftp.imdb.com");
2562	    }
2563	    return(1);
2564	}
2565	elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) {
2566	    my $better=$self->dbinfoCalcBytesPerEntry("ratings", $num);
2567	    $self->status("ARG estimate of $countEstimate for ratings needs updating, found $num ($better bytes/entry)");
2568	}
2569	$self->dbinfoAdd("db_stat_ratings_count", "$num");
2570
2571	$self->status("writing stage6 data ..");
2572	{
2573	    my $countEstimate=$self->dbinfoGet("db_stat_ratings_count", 0);
2574	    my $progress=Term::ProgressBar->new({name  => "writing ratings",
2575						 count => $countEstimate,
2576						 ETA   => 'linear'})
2577	      if ($self->{showProgressBar});
2578	    $progress->minor(0) if ($self->{showProgressBar});
2579	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2580	    my $next_update=0;
2581
2582	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2583	    my $count=0;
2584	    for my $movie (keys %{$self->{movies}}) {
2585		my @value=@{$self->{movies}->{$movie}};
2586		print OUT "$movie\t$value[0]\t$value[1]\t$value[2]\n";
2587
2588		$count++;
2589		if ($self->{showProgressBar}) {
2590		    # re-adjust target so progress bar doesn't seem too wonky
2591		    if ( $count > $countEstimate ) {
2592			$countEstimate = $progress->target($count+100);
2593			$next_update=$progress->update($count);
2594		    }
2595		    elsif ( $count > $next_update ) {
2596			$next_update=$progress->update($count);
2597		    }
2598		}
2599	    }
2600	    $progress->update($countEstimate) if ($self->{showProgressBar});
2601	    close(OUT);
2602	    delete($self->{movies});
2603	}
2604    }
2605    elsif ( $stage == 7 ) {
2606	$self->status("parsing Keywords list for stage $stage..");
2607
2608	if ( !defined($self->{imdbListFiles}->{keywords}) ) {
2609	    $self->status("no keywords file downloaded, see --with-keywords details in documentation");
2610	    return(0);
2611	}
2612
2613	my $countEstimate=5630000;
2614	my $num=$self->readKeywords($countEstimate, "$self->{imdbListFiles}->{keywords}");
2615	if ( $num < 0 ) {
2616	    if ( $num == -2 ) {
2617		$self->error("you need to download $self->{imdbListFiles}->{keywords} from ftp.imdb.com");
2618	    }
2619	    return(1);
2620	}
2621	elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
2622	    $self->status("ARG estimate of $countEstimate for keywords needs updating, found $num");
2623	}
2624	$self->dbinfoAdd("keywords_list_file",         "$self->{imdbListFiles}->{keywords}");
2625	$self->dbinfoAdd("keywords_list_file_size", -s "$self->{imdbListFiles}->{keywords}");
2626	$self->dbinfoAdd("db_stat_keywords_count", "$num");
2627
2628	$self->status("writing stage$stage data ..");
2629	{
2630	    my $countEstimate=$self->dbinfoGet("db_stat_keywords_count", 0);
2631	    my $progress=Term::ProgressBar->new({name  => "writing keywords",
2632						 count => $countEstimate,
2633						 ETA   => 'linear'})
2634	      if ($self->{showProgressBar});
2635	    $progress->minor(0) if ($self->{showProgressBar});
2636	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2637	    my $next_update=0;
2638
2639	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2640
2641	    my $count=0;
2642	    for my $movie (keys %{$self->{movies}}) {
2643		print OUT "$movie\t$self->{movies}->{$movie}\n";
2644
2645		$count++;
2646		if ($self->{showProgressBar}) {
2647		    # re-adjust target so progress bar doesn't seem too wonky
2648		    if ( $count > $countEstimate ) {
2649			$countEstimate = $progress->target($count+100);
2650			$next_update=$progress->update($count);
2651		    }
2652		    elsif ( $count > $next_update ) {
2653			$next_update=$progress->update($count);
2654		    }
2655		}
2656	    }
2657	    $progress->update($countEstimate) if ($self->{showProgressBar});
2658	    close(OUT);
2659	    delete($self->{movies});
2660	}
2661    }
2662    elsif ( $stage == 8 ) {
2663	$self->status("parsing Plot list for stage $stage..");
2664
2665	if ( !defined($self->{imdbListFiles}->{plot}) ) {
2666	    $self->status("no plot file downloaded, see --with-plot details in documentation");
2667	    return(0);
2668	}
2669
2670	my $countEstimate=222222;
2671	my $num=$self->readPlots($countEstimate, "$self->{imdbListFiles}->{plot}");
2672	if ( $num < 0 ) {
2673	    if ( $num == -2 ) {
2674		$self->error("you need to download $self->{imdbListFiles}->{plot} from ftp.imdb.com");
2675	    }
2676	    return(1);
2677	}
2678	elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) {
2679	    $self->status("ARG estimate of $countEstimate for plots needs updating, found $num");
2680	}
2681	$self->dbinfoAdd("plots_list_file",         "$self->{imdbListFiles}->{plot}");
2682	$self->dbinfoAdd("plots_list_file_size", -s "$self->{imdbListFiles}->{plot}");
2683	$self->dbinfoAdd("db_stat_plots_count", "$num");
2684
2685	$self->status("writing stage$stage data ..");
2686	{
2687	    my $countEstimate=$self->dbinfoGet("db_stat_plots_count", 0);
2688	    my $progress=Term::ProgressBar->new({name  => "writing plots",
2689						 count => $countEstimate,
2690						 ETA   => 'linear'})
2691	      if ($self->{showProgressBar});
2692	    $progress->minor(0) if ($self->{showProgressBar});
2693	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2694	    my $next_update=0;
2695
2696	    open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!";
2697
2698	    my $count=0;
2699	    for my $movie (keys %{$self->{movies}}) {
2700		print OUT "$movie\t$self->{movies}->{$movie}\n";
2701
2702		$count++;
2703		if ($self->{showProgressBar}) {
2704		    # re-adjust target so progress bar doesn't seem too wonky
2705		    if ( $count > $countEstimate ) {
2706			$countEstimate = $progress->target($count+100);
2707			$next_update=$progress->update($count);
2708		    }
2709		    elsif ( $count > $next_update ) {
2710			$next_update=$progress->update($count);
2711		    }
2712		}
2713	    }
2714	    $progress->update($countEstimate) if ($self->{showProgressBar});
2715	    close(OUT);
2716	    delete($self->{movies});
2717	}
2718    }
2719    elsif ( $stage == $self->{stageLast} ) {
2720	my $tab=sprintf("\t");
2721
2722	$self->status("indexing all previous stage's data for stage ".$self->{stageLast}."..");
2723
2724	$self->status("parsing stage 1 data (movie list)..");
2725	my %movies;
2726	{
2727	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2728	    my $progress=Term::ProgressBar->new({name  => "reading titles",
2729						 count => $countEstimate,
2730						 ETA   => 'linear'})
2731	      if ($self->{showProgressBar});
2732	    $progress->minor(0) if ($self->{showProgressBar});
2733	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2734	    my $next_update=0;
2735
2736	    open(IN, "< $self->{imdbDir}/stage1.data") || die "$self->{imdbDir}/stage1.data:$!";
2737	    while(<IN>) {
2738		chop();
2739		$movies{$_}="";
2740
2741		if ($self->{showProgressBar}) {
2742		    # re-adjust target so progress bar doesn't seem too wonky
2743		    if ( $. > $countEstimate ) {
2744			$countEstimate = $progress->target($.+100);
2745			$next_update=$progress->update($.);
2746		    }
2747		    elsif ( $. > $next_update ) {
2748			$next_update=$progress->update($.);
2749		    }
2750		}
2751	    }
2752	    close(IN);
2753	    $progress->update($countEstimate) if ($self->{showProgressBar});
2754	}
2755
2756	$self->status("merging in stage 2 data (directors)..");
2757	if ( 1 ) {
2758	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2759	    my $progress=Term::ProgressBar->new({name  => "merging directors",
2760						 count => $countEstimate,
2761						 ETA   => 'linear'})
2762	      if ($self->{showProgressBar});
2763	    $progress->minor(0) if ($self->{showProgressBar});
2764	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2765	    my $next_update=0;
2766
2767	    open(IN, "< $self->{imdbDir}/stage2.data") || die "$self->{imdbDir}/stage2.data:$!";
2768	    while(<IN>) {
2769		chop();
2770		s/^([^\t]+)\t//o;
2771		if ( !defined($movies{$1}) ) {
2772		    $self->error("directors list references unidentified title '$1'");
2773		    next;
2774		}
2775		$movies{$1}=$_;
2776
2777		if ($self->{showProgressBar}) {
2778		    # re-adjust target so progress bar doesn't seem too wonky
2779		    if ( $. > $countEstimate ) {
2780			$countEstimate = $progress->target($.+100);
2781			$next_update=$progress->update($.);
2782		    }
2783		    elsif ( $. > $next_update ) {
2784			$next_update=$progress->update($.);
2785		    }
2786		}
2787	    }
2788	    $progress->update($countEstimate) if ($self->{showProgressBar});
2789	    close(IN);
2790	}
2791
2792	if ( 1 ) {
2793	    # fill in default for movies we didn't have a director for
2794	    for my $key (keys %movies) {
2795		if ( !length($movies{$key})) {
2796		    $movies{$key}="<>";
2797		}
2798	    }
2799	}
2800
2801	$self->status("merging in stage 3 data (actors)..");
2802	if ( 1 ) {
2803	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2804	    my $progress=Term::ProgressBar->new({name  => "merging actors",
2805						 count => $countEstimate,
2806						 ETA   => 'linear'})
2807	      if ($self->{showProgressBar});
2808	    $progress->minor(0) if ($self->{showProgressBar});
2809	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2810	    my $next_update=0;
2811
2812	    open(IN, "< $self->{imdbDir}/stage3.data") || die "$self->{imdbDir}/stage3.data:$!";
2813	    while(<IN>) {
2814		chop();
2815		s/^([^\t]+)\t//o;
2816		my $dbkey=$1;
2817		my $val=$movies{$dbkey};
2818		if ( !defined($val) ) {
2819		    $self->error("actors list references unidentified title '$dbkey'");
2820		    next;
2821		}
2822		if ( $val=~m/$tab/o ) {
2823		    $movies{$dbkey}=$val."|".$_;
2824		}
2825		else {
2826		    $movies{$dbkey}=$val.$tab.$_;
2827		}
2828		if ($self->{showProgressBar}) {
2829		    # re-adjust target so progress bar doesn't seem too wonky
2830		    if ( $. > $countEstimate ) {
2831			$countEstimate = $progress->target($.+100);
2832			$next_update=$progress->update($.);
2833		    }
2834		    elsif ( $. > $next_update ) {
2835			$next_update=$progress->update($.);
2836		    }
2837		}
2838	    }
2839	    $progress->update($countEstimate) if ($self->{showProgressBar});
2840	    close(IN);
2841	}
2842
2843	$self->status("merging in stage 4 data (actresses)..");
2844	if ( 1 ) {
2845	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
2846	    my $progress=Term::ProgressBar->new({name  => "merging actresses",
2847						 count => $countEstimate,
2848						 ETA   => 'linear'})
2849	      if ($self->{showProgressBar});
2850	    $progress->minor(0) if ($self->{showProgressBar});
2851	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2852	    my $next_update=0;
2853
2854	    open(IN, "< $self->{imdbDir}/stage4.data") || die "$self->{imdbDir}/stage4.data:$!";
2855	    while(<IN>) {
2856		chop();
2857		s/^([^\t]+)\t//o;
2858		my $dbkey=$1;
2859		my $val=$movies{$dbkey};
2860		if ( !defined($val) ) {
2861		    $self->error("actresses list references unidentified title '$dbkey'");
2862		    next;
2863		}
2864		if ( $val=~m/$tab/o ) {
2865		    $movies{$dbkey}=$val."|".$_;
2866		}
2867		else {
2868		    $movies{$dbkey}=$val.$tab.$_;
2869		}
2870		if ($self->{showProgressBar}) {
2871		    # re-adjust target so progress bar doesn't seem too wonky
2872		    if ( $. > $countEstimate ) {
2873			$countEstimate = $progress->target($.+100);
2874			$next_update=$progress->update($.);
2875		    }
2876		    elsif ( $. > $next_update ) {
2877			$next_update=$progress->update($.);
2878		    }
2879		}
2880	    }
2881	    $progress->update($countEstimate) if ($self->{showProgressBar});
2882	    close(IN);
2883	}
2884	if ( 1 ) {
2885	    # fill in placeholder if no actors were found
2886	    for my $key (keys %movies) {
2887		if ( !($movies{$key}=~m/$tab/o) ) {
2888		    $movies{$key}.=$tab."<>";
2889		}
2890	    }
2891	}
2892
2893	$self->status("merging in stage 5 data (genres)..");
2894	if ( 1 ) {
2895	    my $countEstimate=$self->dbinfoGet("db_stat_genres_count", 1);  # '1' prevents the spurious "(nothing to do)" msg
2896	    my $progress=Term::ProgressBar->new({name  => "merging genres",
2897						 count => $countEstimate,
2898						 ETA   => 'linear'})
2899	      if ($self->{showProgressBar});
2900	    $progress->minor(0) if ($self->{showProgressBar});
2901	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2902	    my $next_update=0;
2903
2904	    open(IN, "< $self->{imdbDir}/stage5.data") || die "$self->{imdbDir}/stage5.data:$!";
2905	    while(<IN>) {
2906		chop();
2907		s/^([^\t]+)\t//o;
2908		my $dbkey=$1;
2909		my $genres=$_;
2910		my $val=$movies{$dbkey};
2911		if ( !defined($val) ) {
2912		    $self->error("genres list references unidentified title '$1'");
2913		    next;
2914		}
2915		$movies{$dbkey}.=$tab.$genres;
2916
2917		if ($self->{showProgressBar}) {
2918		    # re-adjust target so progress bar doesn't seem too wonky
2919		    if ( $. > $countEstimate ) {
2920			$countEstimate = $progress->target($.+100);
2921			$next_update=$progress->update($.);
2922		    }
2923		    elsif ( $. > $next_update ) {
2924			$next_update=$progress->update($.);
2925		    }
2926		}
2927	    }
2928	    $progress->update($countEstimate) if ($self->{showProgressBar});
2929	    close(IN);
2930	}
2931
2932	if ( 1 ) {
2933	    # fill in placeholder if no genres were found
2934	    for my $key (keys %movies) {
2935		my $val=$movies{$key};
2936		my $t=index($val, $tab);
2937		if ( $t == -1 ) {
2938		    die "corrupt entry '$key' '$val'";
2939		}
2940		if ( index($val, $tab, $t+1) == -1 ) {
2941		    $movies{$key}.=$tab."<>";
2942		}
2943	    }
2944	}
2945
2946	$self->status("merging in stage 6 data (ratings)..");
2947	if ( 1 ) {
2948	    my $countEstimate=$self->dbinfoGet("db_stat_ratings_count", 1);  # '1' prevents the spurious "(nothing to do)" msg
2949	    my $progress=Term::ProgressBar->new({name  => "merging ratings",
2950						 count => $countEstimate,
2951						 ETA   => 'linear'})
2952	      if ($self->{showProgressBar});
2953	    $progress->minor(0) if ($self->{showProgressBar});
2954	    $progress->max_update_rate(1) if ($self->{showProgressBar});
2955	    my $next_update=0;
2956
2957	    open(IN, "< $self->{imdbDir}/stage6.data") || die "$self->{imdbDir}/stage6.data:$!";
2958	    while(<IN>) {
2959		chop();
2960		s/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$//o;
2961		my $dbkey=$1;
2962		my ($ratingDist, $ratingVotes, $ratingRank)=($2,$3,$4);
2963
2964		my $val=$movies{$dbkey};
2965		if ( !defined($val) ) {
2966		    $self->error("ratings list references unidentified title '$1'");
2967		    next;
2968		}
2969		$movies{$dbkey}.=$tab.$ratingDist.$tab.$ratingVotes.$tab.$ratingRank;
2970
2971		if ($self->{showProgressBar}) {
2972		    # re-adjust target so progress bar doesn't seem too wonky
2973		    if ( $. > $countEstimate ) {
2974			$countEstimate = $progress->target($.+100);
2975			$next_update=$progress->update($.);
2976		    }
2977		    elsif ( $. > $next_update ) {
2978			$next_update=$progress->update($.);
2979		    }
2980		}
2981	    }
2982	    $progress->update($countEstimate) if ($self->{showProgressBar});
2983	    close(IN);
2984	}
2985
2986	if ( 1 ) {
2987	    # fill in placeholder if no genres were found
2988	    for my $key (keys %movies) {
2989		my $val=$movies{$key};
2990
2991		my $t=index($val, $tab);
2992		if ( $t == -1  ) {
2993		    die "corrupt entry '$key' '$val'";
2994		}
2995		my $j=index($val, $tab, $t+1);
2996		if ( $j == -1  ) {
2997		    die "corrupt entry '$key' '$val'";
2998		}
2999		if ( index($val, $tab, $j+1) == -1 ) {
3000		    $movies{$key}.=$tab."<>".$tab."<>".$tab."<>";
3001		}
3002	    }
3003	}
3004
3005	$self->status("merging in stage 7 data (keywords)..");
3006	#if ( 1 ) {         # this stage is optional
3007        if ( -f "$self->{imdbDir}/stage7.data" ) {
3008	    my $countEstimate=$self->dbinfoGet("db_stat_keywords_count", 1);  # '1' prevents the spurious "(nothing to do)" msg
3009	    my $progress=Term::ProgressBar->new({name  => "merging keywords",
3010						 count => $countEstimate,
3011						 ETA   => 'linear'})
3012	      if ($self->{showProgressBar});
3013	    $progress->minor(0) if ($self->{showProgressBar});
3014	    $progress->max_update_rate(1) if ($self->{showProgressBar});
3015	    my $next_update=0;
3016
3017	    open(IN, "< $self->{imdbDir}/stage7.data") || die "$self->{imdbDir}/stage7.data:$!";
3018	    while(<IN>) {
3019		chop();
3020		s/^([^\t]+)\t+//o;
3021		my $dbkey=$1;
3022		my $keywords=$_;
3023		if ( !defined($movies{$dbkey}) ) {
3024		    $self->error("keywords list references unidentified title '$1'");
3025		    next;
3026		}
3027		$movies{$dbkey}.=$tab.$keywords;
3028
3029		if ($self->{showProgressBar}) {
3030		    # re-adjust target so progress bar doesn't seem too wonky
3031		    if ( $. > $countEstimate ) {
3032			$countEstimate = $progress->target($.+100);
3033			$next_update=$progress->update($.);
3034		    }
3035		    elsif ( $. > $next_update ) {
3036			$next_update=$progress->update($.);
3037		    }
3038		}
3039	    }
3040	    $progress->update($countEstimate) if ($self->{showProgressBar});
3041	    close(IN);
3042	}
3043
3044	if ( 1 ) {
3045	    # fill in default for movies we didn't have any keywords for
3046	    for my $key (keys %movies) {
3047		my $val=$movies{$key};
3048		#keyword is 6th entry
3049		my $t = 0;
3050		for my $i (0..4) {
3051		    $t=index($val, $tab, $t);
3052		    if ( $t == -1 ) {
3053		    	die "Corrupt entry '$key' '$val'";
3054		    }
3055		    $t+=1;
3056		}
3057		if ( index($val, $tab, $t) == -1 ) {
3058		    $movies{$key}.=$tab."<>";
3059		}
3060	    }
3061	}
3062
3063	$self->status("merging in stage 8 data (plots)..");
3064	#if ( 1 ) {         # this stage is optional
3065        if ( -f "$self->{imdbDir}/stage8.data" ) {
3066	    my $countEstimate=$self->dbinfoGet("db_stat_plots_count", 1);  # '1' prevents the spurious "(nothing to do)" msg
3067	    my $progress=Term::ProgressBar->new({name  => "merging plots",
3068						 count => $countEstimate,
3069						 ETA   => 'linear'})
3070	      if ($self->{showProgressBar});
3071	    $progress->minor(0) if ($self->{showProgressBar});
3072	    $progress->max_update_rate(1) if ($self->{showProgressBar});
3073	    my $next_update=0;
3074
3075	    open(IN, "< $self->{imdbDir}/stage8.data") || die "$self->{imdbDir}/stage8.data:$!";
3076	    while(<IN>) {
3077		chop();
3078		s/^([^\t]+)\t+//o;
3079		my $dbkey=$1;
3080		my $plot=$_;
3081		if ( !defined($movies{$dbkey}) ) {
3082		    $self->error("plot list references unidentified title '$1'");
3083		    next;
3084		}
3085		$movies{$dbkey}.=$tab.$plot;
3086
3087		if ($self->{showProgressBar}) {
3088		    # re-adjust target so progress bar doesn't seem too wonky
3089		    if ( $. > $countEstimate ) {
3090			$countEstimate = $progress->target($.+100);
3091			$next_update=$progress->update($.);
3092		    }
3093		    elsif ( $. > $next_update ) {
3094			$next_update=$progress->update($.);
3095		    }
3096		}
3097	    }
3098	    $progress->update($countEstimate) if ($self->{showProgressBar});
3099	    close(IN);
3100	}
3101	if ( 1 ) {
3102	    # fill in default for movies we didn't have any plot for
3103	    for my $key (keys %movies) {
3104		my $val=$movies{$key};
3105		#plot is 7th entry
3106		my $t = 0;
3107		for my $i (0..5) {
3108		    $t=index($val, $tab, $t);
3109		    if ( $t == -1 ) {
3110		    	die "Corrupt entry '$key' '$val'";
3111		    }
3112		    $t+=1;
3113		}
3114		if ( index($val, $tab, $t) == -1 ) {
3115		    $movies{$key}.=$tab."<>";
3116		}
3117	    }
3118	}
3119
3120	#unlink("$self->{imdbDir}/stage1.data");
3121	#unlink("$self->{imdbDir}/stage2.data");
3122	#unlink("$self->{imdbDir}/stage3.data");
3123
3124        # ---------------------------------------------------------------------------------------
3125
3126
3127	#
3128	# note: not all movies end up with a cast, but we include them anyway.
3129	#
3130
3131	my %nmovies;
3132	{
3133	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
3134	    my $progress=Term::ProgressBar->new({name  => "computing index",
3135						 count => $countEstimate,
3136						 ETA   => 'linear'})
3137	      if ($self->{showProgressBar});
3138	    $progress->minor(0) if ($self->{showProgressBar});
3139	    $progress->max_update_rate(1) if ($self->{showProgressBar});
3140	    my $next_update=0;
3141
3142	    my $count=0;
3143	    for my $key (keys %movies) {
3144		my $dbkey=$key;
3145
3146		# drop episode information - ex: {Twelve Angry Men (1954)}
3147		$dbkey=~s/\s*\{[^\}]+\}//go;
3148
3149		# todo - this would make things easier
3150		# change double-quotes around title to be (made-for-tv) suffix instead
3151		if ( $dbkey=~m/^\"/o && #"
3152		     $dbkey=~m/\"\s*\(/o ) { #"
3153		    $dbkey.=" (tv_series)";
3154		}
3155		# how rude, some entries have (TV) appearing more than once.
3156		$dbkey=~s/\(TV\)\s*\(TV\)$/(TV)/o;
3157
3158		my $qualifier;
3159		if ( $dbkey=~s/\s+\(TV\)$//o ) {
3160		    $qualifier="tv_movie";
3161		}
3162		elsif ( $dbkey=~s/\s+\(mini\) \(tv_series\)$// ) {
3163		    $qualifier="tv_mini_series";
3164		}
3165		elsif ( $dbkey=~s/\s+\(tv_series\)$// ) {
3166		    $qualifier="tv_series";
3167		}
3168		elsif ( $dbkey=~s/\s+\(mini\)$//o ) {
3169		    $qualifier="tv_mini_series";
3170		}
3171		elsif ( $dbkey=~s/\s+\(V\)$//o ) {
3172		    $qualifier="video_movie";
3173		}
3174		elsif ( $dbkey=~s/\s+\(VG\)$//o ) {
3175		    #$qualifier="video_game";
3176		    delete($movies{$key});
3177		    next;
3178		}
3179		else {
3180		    $qualifier="movie";
3181		}
3182		#if ( $dbkey=~s/\s+\((tv_series|tv_mini_series|tv_movie|video_movie|video_game)\)$//o ) {
3183		 #   $qualifier=$1;
3184		#}
3185		my $year;
3186		my $title=$dbkey;
3187
3188		if ( $title=~m/^\"/o && $title=~m/\"\s*\(/o ) { #"
3189		    $title=~s/^\"//o; #"
3190		    $title=~s/\"(\s*\()/$1/o; #"
3191		}
3192
3193		if ( $title=~s/\s+\((\d\d\d\d)\)$//o ||
3194		     $title=~s/\s+\((\d\d\d\d)\/[IVX]+\)$//o ) {
3195		    $year=$1;
3196		}
3197		elsif ( $title=~s/\s+\((\?\?\?\?)\)$//o ||
3198			$title=~s/\s+\((\?\?\?\?)\/[IVX]+\)$//o ) {
3199		    $year="0000";
3200		}
3201		else {
3202		    $self->error("movie list format failed to decode year from title '$title'");
3203		    $year="0000";
3204		}
3205		$title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og;
3206
3207		my $hashkey=lc("$title ($year)");
3208		$hashkey=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg;
3209
3210		if ( defined($movies{$hashkey}) ) {
3211		    die "unable to place moviedb key for $key, report to xmltv-devel\@lists.sf.net";
3212		}
3213		die "title \"$title\" contains a tab" if ( $title=~m/\t/o );
3214		#print "key:$dbkey\n\ttitle=$title\n\tyear=$year\n\tqualifier=$qualifier\n";
3215		#print "key $key: value=\"$movies{$key}\"\n";
3216
3217		$nmovies{$hashkey}=$dbkey.$tab.$year.$tab.$qualifier.$tab.delete($movies{$key});
3218		$count++;
3219
3220		if ($self->{showProgressBar}) {
3221		    # re-adjust target so progress bar doesn't seem too wonky
3222		    if ( $count > $countEstimate ) {
3223			$countEstimate = $progress->target($count+100);
3224			$next_update=$progress->update($count);
3225		    }
3226		    elsif ( $count > $next_update ) {
3227			$next_update=$progress->update($count);
3228		    }
3229		}
3230	    }
3231	    $progress->update($countEstimate) if ($self->{showProgressBar});
3232
3233	    if ( scalar(keys %movies) != 0 ) {
3234		die "what happened, we have keys left ?";
3235	    }
3236	    undef(%movies);
3237	}
3238
3239	{
3240	    my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0);
3241	    my $progress=Term::ProgressBar->new({name  => "writing database",
3242						 count => $countEstimate,
3243						 ETA   => 'linear'})
3244	      if ($self->{showProgressBar});
3245	    $progress->minor(0) if ($self->{showProgressBar});
3246	    $progress->max_update_rate(1) if ($self->{showProgressBar});
3247	    my $next_update=0;
3248
3249	    open(IDX, "> $self->{moviedbIndex}") || die "$self->{moviedbIndex}:$!";
3250	    open(DAT, "> $self->{moviedbData}") || die "$self->{moviedbData}:$!";
3251	    my $count=0;
3252	    for my $key (sort {$a cmp $b} keys %nmovies) {
3253		my $val=delete($nmovies{$key});
3254		#print "movie $key: $val\n";
3255		#$val=~s/^([^\t]+)\t([^\t]+)\t([^\t]+)\t//o || die "internal failure ($key:$val)";
3256		my ($dbkey, $year, $qualifier,$directors,$actors,@rest)=split('\t', $val);
3257		#die ("no 1") if ( !defined($dbkey));
3258		#die ("no 2") if ( !defined($year));
3259		#die ("no 3") if ( !defined($qualifier));
3260		#die ("no 4") if ( !defined($directors));
3261		#die ("no 5") if ( !defined($actors));
3262		#print "key:$key\n\ttitle=$dbkey\n\tyear=$year\n\tqualifier=$qualifier\n";
3263
3264		#my ($directors, $actors)=split('\t', $val);
3265
3266		my $details="";
3267
3268		if ( $directors eq "<>" ) {
3269		    $details.="<>";
3270		}
3271		else {
3272		    # sort directors by last name, removing duplicates
3273		    my $last='';
3274		    for my $name (sort {$a cmp $b} split('\|', $directors)) {
3275			if ( $name ne $last ) {
3276			    $details.="$name|";
3277			    $last=$name;
3278			}
3279		    }
3280		    $details=~s/\|$//o;
3281		}
3282
3283		#print "      $dbkey: $val\n";
3284		if ( $actors eq "<>" ) {
3285		    $details.=$tab."<>";
3286		}
3287		else {
3288		    $details.=$tab;
3289
3290		    # sort actors by billing, removing repeated entries
3291		    # be warned, two actors may have the same billing level
3292		    my $last='';
3293		    for my $c (sort {$a cmp $b} split('\|', $actors)) {
3294			my ($billing, $name)=split(':', $c);
3295			# remove Host/Narrators from end
3296			# BUG - should remove (I)'s from actors/actresses names when details are generated
3297			$name=~s/\s\([IVX]+\)\[/\[/o;
3298			$name=~s/\s\([IVX]+\)$//o;
3299
3300			if ( $name ne $last ) {
3301			    $details.="$name|";
3302			    $last=$name;
3303			}
3304			#print "      $c: split gives'$billing' and '$name'\n";
3305		    }
3306		    $details=~s/\|$//o;
3307		}
3308		$count++;
3309		my $lineno=sprintf("%07d", $count);
3310		print IDX $key."\t".$dbkey."\t".$year."\t".$qualifier."\t".$lineno."\n";
3311		print DAT $lineno.":".$details."\t".join($tab, @rest)."\n";
3312
3313		if ($self->{showProgressBar}) {
3314		    # re-adjust target so progress bar doesn't seem too wonky
3315		    if ( $count > $countEstimate ) {
3316			$countEstimate = $progress->target($count+100);
3317			$next_update=$progress->update($count);
3318		    }
3319		    elsif ( $count > $next_update ) {
3320			$next_update=$progress->update($count);
3321		    }
3322		}
3323	    }
3324	    $progress->update($countEstimate) if ($self->{showProgressBar});
3325	    close(DAT);
3326	    close(IDX);
3327	}
3328
3329	$self->dbinfoAdd("db_version", $XMLTV::IMDB::VERSION);
3330
3331	if ( $self->dbinfoSave() ) {
3332	    $self->error("$self->{moviedbInfo}:$!");
3333	    return(1);
3334	}
3335
3336	$self->status("running quick sanity check on database indexes...");
3337	my $imdb=new XMLTV::IMDB('imdbDir' => $self->{imdbDir},
3338				 'verbose' => $self->{verbose});
3339
3340	if ( -e "$self->{moviedbOffline}" ) {
3341	    unlink("$self->{moviedbOffline}");
3342	}
3343
3344	if ( my $errline=$imdb->sanityCheckDatabase() ) {
3345	    open(OFF, "> $self->{moviedbOffline}") || die "$self->{moviedbOffline}:$!";
3346	    print OFF $errline."\n";
3347	    print OFF "one of the prep stages' must have produced corrupt data\n";
3348	    print OFF "report the following details to xmltv-devel\@lists.sf.net\n";
3349
3350	    my $info=XMLTV::IMDB::loadDBInfo($self->{moviedbInfo});
3351	    if ( ref $info eq 'SCALAR' ) {
3352		print OFF "\tdbinfo file corrupt\n";
3353		print OFF "\t$info";
3354	    }
3355	    else {
3356		for my $key (sort keys %{$info}) {
3357		    print OFF "\t$key:$info->{$key}\n";
3358		}
3359	    }
3360	    print OFF "database taken offline\n";
3361	    close(OFF);
3362	    open(OFF, "< $self->{moviedbOffline}") || die "$self->{moviedbOffline}:$!";
3363	    while(<OFF>) {
3364		chop();
3365		$self->error($_);
3366	    }
3367	    close(OFF);
3368	    return(1);
3369	}
3370	$self->status("sanity intact :)");
3371    }
3372    else {
3373	$self->error("tv_imdb: invalid stage $stage: only 1-".$self->{stageLast}." are valid");
3374	return(1);
3375    }
3376
3377    $self->dbinfoAdd("seconds_to_complete_prep_stage_$stage", (time()-$startTime));
3378    if ( $self->dbinfoSave() ) {
3379	$self->error("$self->{moviedbInfo}:$!");
3380	return(1);
3381    }
3382    return(0);
3383}
3384
3385sub crunchStage($$)
3386{
3387    my ($self, $stage)=@_;
3388
3389    if ( $stage == $self->{stageLast} ) {
3390         # check all the pre-requisite stages have been run
3391        for (my $st=1 ; $st < $self->{stageLast}; $st++ ) {
3392	if ( !$self->stageComplete($st) ) {
3393                    #$self->error("prep stages must be run in sequence..");
3394	    $self->error("prepStage $st either has never been run or failed");
3395                if ( grep { $_ == $st } values %{$self->{optionalStages}} ) {
3396                    $self->error("data for this stage will NOT be added");
3397                } else {
3398	    $self->error("rerun tv_imdb with --prepStage=$st");
3399	    return(1);
3400	}
3401    }
3402        }
3403    }
3404
3405    if ( -f "$self->{moviedbInfo}" && $stage != 1 ) {
3406	my $ret=$self->dbinfoLoad();
3407	if ( $ret ) {
3408	    $self->error($ret);
3409	    return(1);
3410	}
3411    }
3412
3413    $self->redirect("$self->{imdbDir}/stage$stage.log") || return(1);
3414    my $ret=$self->invokeStage($stage);
3415    $self->redirect(undef);
3416
3417    if ( $ret == 0 ) {
3418	if ( $self->{errorCountInLog} == 0 ) {
3419	    $self->status("prep stage $stage succeeded with no errors");
3420	}
3421	else {
3422	    $self->status("prep stage $stage succeeded with $self->{errorCountInLog} errors in $self->{imdbDir}/stage$stage.log");
3423	    if ( $stage == $self->{stageLast} && $self->{errorCountInLog} > 30 && $self->{errorCountInLog} < 80 ) {
3424		$self->status("this stage commonly produces around 60 (or so) warnings because of imdb");
3425		$self->status("list file inconsistancies, they can usually be safely ignored");
3426	    }
3427	}
3428    }
3429    else {
3430	if ( $self->{errorCountInLog} == 0 ) {
3431	    $self->status("prep stage $stage failed (with no logged errors)");
3432	}
3433	else {
3434	    $self->status("prep stage $stage failed with $self->{errorCountInLog} errors in $self->{imdbDir}/stage$stage.log");
3435	}
3436    }
3437    return($ret);
3438}
3439
34401;
3441