1# 2# $Id: IMDB.pm,v 1.69 2015/07/12 00:59:01 knowledgejunkie Exp $ 3# 4# The IMDB file contains two packages: 5# 1. XMLTV::IMDB::Cruncher package which parses and manages IMDB "lists" files 6# from ftp.imdb.com 7# 2. XMLTV::IMDB package that uses data files from the Cruncher package to 8# update/add details to XMLTV programme nodes. 9# 10# FUTURE - multiple hits on the same 'title only' could try and look for 11# character names matching from description to imdb.com character 12# names. 13# 14# FUTURE - multiple hits on 'title only' should probably pick latest 15# tv series over any older ones. May make for better guesses. 16# 17# BUG - we identify 'presenters' by the word "Host" appearing in the character 18# description. For some movies, character names include the word Host. 19# ex. Animal, The (2001) has a character named "Badger Milk Host". 20# 21# BUG - if there is a matching title with > 1 entry (say made for tv-movie and 22# at tv-mini series) made in the same year (or even "close" years) it is 23# possible for us to pick the wrong one we should pick the one with the 24# closest year, not just the first closest match based on the result ordering 25# for instance Ghost Busters was made in 1984, and into a tv series in 26# 1986. if we have a list of GhostBusters 1983, we should pick the 1984 movie 27# and not 1986 tv series...maybe :) but currently we'll pick the first 28# returned close enough match instead of trying the closest date match of 29# the approx hits. 30# 31 32use strict; 33 34package XMLTV::IMDB; 35 36use open ':encoding(iso-8859-1)'; # try to enforce file encoding (does this work in Perl <5.8.1? ) 37 38# 39# HISTORY 40# .6 = what was here for the longest time 41# .7 = fixed file size est calculations 42# = moviedb.info now includes _file_size_uncompressed values for each downloaded file 43# .8 = updated file size est calculations 44# = moviedb.dat directors and actors list no longer include repeated names (which mostly 45# occured in episodic tv programs (reported by Alexy Khrabrov) 46# .9 = added keywords data 47# .10 = added plot data 48# 49our $VERSION = '0.10'; # version number of database 50 51sub new 52{ 53 my ($type) = shift; 54 my $self={ @_ }; # remaining args become attributes 55 56 for ('imdbDir', 'verbose') { 57 die "invalid usage - no $_" if ( !defined($self->{$_})); 58 } 59 #$self->{verbose}=2; 60 $self->{replaceDates}=0 if ( !defined($self->{replaceDates})); 61 $self->{replaceTitles}=0 if ( !defined($self->{replaceTitles})); 62 $self->{replaceCategories}=0 if ( !defined($self->{replaceCategories})); 63 $self->{replaceKeywords}=0 if ( !defined($self->{replaceKeywords})); 64 $self->{replaceURLs}=0 if ( !defined($self->{replaceURLs})); 65 $self->{replaceDirectors}=1 if ( !defined($self->{replaceDirectors})); 66 $self->{replaceActors}=0 if ( !defined($self->{replaceActors})); 67 $self->{replacePresentors}=1 if ( !defined($self->{replacePresentors})); 68 $self->{replaceCommentators}=1 if ( !defined($self->{replaceCommentators})); 69 $self->{replaceStarRatings}=0 if ( !defined($self->{replaceStarRatings})); 70 $self->{replacePlot}=0 if ( !defined($self->{replacePlot})); 71 72 $self->{updateDates}=1 if ( !defined($self->{updateDates})); 73 $self->{updateTitles}=1 if ( !defined($self->{updateTitles})); 74 $self->{updateCategories}=1 if ( !defined($self->{updateCategories})); 75 $self->{updateCategoriesWithGenres}=1 if ( !defined($self->{updateCategoriesWithGenres})); 76 $self->{updateKeywords}=0 if ( !defined($self->{updateKeywords})); # default is to NOT add keywords 77 $self->{updateURLs}=1 if ( !defined($self->{updateURLs})); 78 $self->{updateDirectors}=1 if ( !defined($self->{updateDirectors})); 79 $self->{updateActors}=1 if ( !defined($self->{updateActors})); 80 $self->{updatePresentors}=1 if ( !defined($self->{updatePresentors})); 81 $self->{updateCommentators}=1 if ( !defined($self->{updateCommentators})); 82 $self->{updateStarRatings}=1 if ( !defined($self->{updateStarRatings})); 83 $self->{updatePlot}=0 if ( !defined($self->{updatePlot})); # default is to NOT add plot 84 85 $self->{numActors}=3 if ( !defined($self->{numActors})); # default is to add top 3 actors 86 87 $self->{moviedbIndex}="$self->{imdbDir}/moviedb.idx"; 88 $self->{moviedbData}="$self->{imdbDir}/moviedb.dat"; 89 $self->{moviedbInfo}="$self->{imdbDir}/moviedb.info"; 90 $self->{moviedbOffline}="$self->{imdbDir}/moviedb.offline"; 91 92 # default is not to cache lookups 93 $self->{cacheLookups}=0 if ( !defined($self->{cacheLookups}) ); 94 $self->{cacheLookupSize}=0 if ( !defined($self->{cacheLookupSize}) ); 95 96 $self->{cachedLookups}->{tv_series}->{_cacheSize_}=0; 97 98 bless($self, $type); 99 100 $self->{categories}={'movie' =>'Movie', 101 'tv_movie' =>'TV Movie', # made for tv 102 'video_movie' =>'Video Movie', # went straight to video or was made for it 103 'tv_series' =>'TV Series', 104 'tv_mini_series' =>'TV Mini Series'}; 105 106 $self->{stats}->{programCount}=0; 107 108 for my $cat (keys %{$self->{categories}}) { 109 $self->{stats}->{perfect}->{$cat}=0; 110 $self->{stats}->{close}->{$cat}=0; 111 } 112 $self->{stats}->{perfectMatches}=0; 113 $self->{stats}->{closeMatches}=0; 114 115 $self->{stats}->{startTime}=time(); 116 117 return($self); 118} 119 120sub loadDBInfo($) 121{ 122 my $file=shift; 123 my $info; 124 125 open(INFO, "< $file") || return("imdbDir index file \"$file\":$!\n"); 126 while(<INFO>) { 127 chop(); 128 if ( s/^([^:]+)://o ) { 129 $info->{$1}=$_; 130 } 131 } 132 close(INFO); 133 return($info); 134} 135 136sub checkIndexesOkay($) 137{ 138 my $self=shift; 139 if ( ! -d "$self->{imdbDir}" ) { 140 return("imdbDir \"$self->{imdbDir}\" does not exist\n"); 141 } 142 143 if ( -f "$self->{moviedbOffline}" ) { 144 return("imdbDir index offline: check $self->{moviedbOffline} for details"); 145 } 146 147 for my $file ($self->{moviedbIndex}, $self->{moviedbData}, $self->{moviedbInfo}) { 148 if ( ! -f "$file" ) { 149 return("imdbDir index file \"$file\" does not exist\n"); 150 } 151 } 152 153 $VERSION=~m/^(\d+)\.(\d+)$/o || die "package corrupt, VERSION string invalid ($VERSION)"; 154 my ($major, $minor)=($1, $2); 155 156 my $info=loadDBInfo($self->{moviedbInfo}); 157 return($info) if ( ref $info eq 'SCALAR' ); 158 159 if ( !defined($info->{db_version}) ) { 160 return("imdbDir index db missing version information, rerun --prepStage all\n"); 161 } 162 if ( $info->{db_version}=~m/^(\d+)\.(\d+)$/o ) { 163 if ( $1 != $major || $2 < $minor ) { 164 return("imdbDir index db requires updating, rerun --prepStage all\n"); 165 } 166 if ( $1 == 0 && $2 == 1 ) { 167 return("imdbDir index db requires update, rerun --prepStage 5 (bug:actresses never appear)\n"); 168 } 169 if ( $1 == 0 && $2 == 2 ) { 170 # 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run 171 return("imdbDir index db requires minor reindexing, rerun --prepStage 3 and 5\n"); 172 } 173 if ( $1 == 0 && $2 == 3 ) { 174 # 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run 175 return("imdbDir index db requires major reindexing, rerun --prepStage 2 and new prepStages 5,6,7,8 and 9\n"); 176 } 177 if ( $1 == 0 && $2 == 4 ) { 178 # 0.2 -> 0.3 upgrade requires prepStage 5 to be re-run 179 return("imdbDir index db corrupt (got version 0.4), rerun --prepStage all\n"); 180 } 181 # okay 182 return(undef); 183 } 184 else { 185 return("imdbDir index version of '$info->{db_version}' is invalid, rerun --prepStage all\n". 186 "if problem persists, submit bug report to xmltv-devel\@lists.sf.net\n"); 187 } 188} 189 190sub basicVerificationOfIndexes($) 191{ 192 my $self=shift; 193 194 # check that the imdbdir is invalid and up and running 195 my $title="Army of Darkness"; 196 my $year=1992; 197 198 $self->openMovieIndex() || return("basic verification of indexes failed\n". 199 "database index isn't readable"); 200 201 my $verbose = $self->{verbose}; $self->{verbose} = 0; 202 my $res=$self->getMovieMatches($title, $year); 203 $self->{verbose} = $verbose; undef $verbose; 204 if ( !defined($res) ) { 205 $self->closeMovieIndex(); 206 return("basic verification of indexes failed\n". 207 "no match for basic verification of movie \"$title, $year\"\n"); 208 } 209 if ( !defined($res->{exactMatch}) ) { 210 $self->closeMovieIndex(); 211 return("basic verification of indexes failed\n". 212 "no exact match for movie \"$title, $year\"\n"); 213 } 214 if ( scalar(@{$res->{exactMatch}})!= 1) { 215 $self->closeMovieIndex(); 216 return("basic verification of indexes failed\n". 217 "got more than one exact match for movie \"$title, $year\"\n"); 218 } 219 my @exact=@{$res->{exactMatch}}; 220 if ( $exact[0]->{title} ne $title ) { 221 $self->closeMovieIndex(); 222 return("basic verification of indexes failed\n". 223 "title associated with key \"$title, $year\" is bad\n"); 224 } 225 226 if ( $exact[0]->{year} ne "$year" ) { 227 $self->closeMovieIndex(); 228 return("basic verification of indexes failed\n". 229 "year associated with key \"$title, $year\" is bad\n"); 230 } 231 232 my $id=$exact[0]->{id}; 233 $res=$self->getMovieIdDetails($id); 234 if ( !defined($res) ) { 235 $self->closeMovieIndex(); 236 return("basic verification of indexes failed\n". 237 "no movie details for movie \"$title, $year\" (id=$id)\n"); 238 } 239 240 if ( !defined($res->{directors}) ) { 241 $self->closeMovieIndex(); 242 return("basic verification of indexes failed\n". 243 "movie details didn't provide any director for movie \"$title, $year\" (id=$id)\n"); 244 } 245 if ( !$res->{directors}[0]=~m/Raimi/o ) { 246 $self->closeMovieIndex(); 247 return("basic verification of indexes failed\n". 248 "movie details didn't show Raimi as the main director for movie \"$title, $year\" (id=$id)\n"); 249 } 250 if ( !defined($res->{actors}) ) { 251 $self->closeMovieIndex(); 252 return("basic verification of indexes failed\n". 253 "movie details didn't provide any cast movie \"$title, $year\" (id=$id)\n"); 254 } 255 if ( !$res->{actors}[0]=~m/Campbell/o ) { 256 $self->closeMovieIndex(); 257 return("basic verification of indexes failed\n". 258 "movie details didn't show Bruce Campbell as the main actor in movie \"$title, $year\" (id=$id)\n"); 259 } 260 my $matches=0; 261 for (@{$res->{genres}}) { 262 if ( $_ eq "Action" || 263 $_ eq "Comedy" || 264 $_ eq "Fantasy" || 265 $_ eq "Horror" || 266 $_ eq "Romance" ) { 267 $matches++; 268 } 269 } 270 if ( $matches == 0 ) { 271 $self->closeMovieIndex(); 272 return("basic verification of indexes failed\n". 273 "movie details didn't show genres correctly for movie \"$title, $year\" (id=$id)\n"); 274 } 275 if ( !defined($res->{ratingDist}) || 276 !defined($res->{ratingVotes}) || 277 !defined($res->{ratingRank}) ) { 278 $self->closeMovieIndex(); 279 return("basic verification of indexes failed\n". 280 "movie details didn't show imdbratings for movie \"$title, $year\" (id=$id)\n"); 281 } 282 $self->closeMovieIndex(); 283 return(undef); 284 285} 286 287sub sanityCheckDatabase($) 288{ 289 my $self=shift; 290 my $errline; 291 292 $errline=$self->checkIndexesOkay(); 293 return($errline) if ( defined($errline) ); 294 $errline=$self->basicVerificationOfIndexes(); 295 return($errline) if ( defined($errline) ); 296 297 # all okay 298 return(undef); 299} 300 301sub error($$) 302{ 303 print STDERR "tv_imdb: $_[1]\n"; 304} 305 306sub status($$) 307{ 308 if ( $_[0]->{verbose} ) { 309 print STDERR "tv_imdb: $_[1]\n"; 310 } 311} 312 313sub debug($$) 314{ 315 my $self=shift; 316 my $mess=shift; 317 if ( $self->{verbose} > 1 ) { 318 print STDERR "tv_imdb: $mess\n"; 319 } 320} 321 322use Search::Dict; 323 324sub openMovieIndex($) 325{ 326 my $self=shift; 327 328 if ( !open($self->{INDEX_FD}, "< $self->{moviedbIndex}") ) { 329 return(undef); 330 } 331 if ( !open($self->{DBASE_FD}, "< $self->{moviedbData}") ) { 332 close($self->{INDEX_FD}); 333 return(undef); 334 } 335 return(1); 336} 337 338sub closeMovieIndex($) 339{ 340 my $self=shift; 341 342 close($self->{INDEX_FD}); 343 delete($self->{INDEX_FD}); 344 345 close($self->{DBASE_FD}); 346 delete($self->{DBASE_FD}); 347 348 return(1); 349} 350 351# moviedbIndex file has the format: 352# title:lineno 353# where key is a url encoded title followed by the year of production and a colon 354sub getMovieMatches($$$) 355{ 356 my $self=shift; 357 my $title=shift; 358 my $year=shift; 359 360 # Articles are put at the end of a title ( in all languages ) 361 #$match=~s/^(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/og; 362 363 my $match="$title"; 364 if ( defined($year) ) { 365 $match.=" ($year)"; 366 } 367 368 # to encode s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg 369 # to decode s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge; 370 371 # url encode 372 $match=lc($match); 373 $match=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg; 374 375 $self->debug("looking for \"$match\" in $self->{moviedbIndex}"); 376 if ( !$self->{INDEX_FD} ) { 377 die "internal error: index not open"; 378 } 379 380 my $FD=$self->{INDEX_FD}; 381 Search::Dict::look(*{$FD}, $match, 0, 0); 382 my $results; 383 while (<$FD>) { 384 last if ( !m/^$match/ ); 385 386 chop(); 387 my @arr=split('\t', $_); 388 if ( scalar(@arr) != 5 ) { 389 warn "$self->{moviedbIndex} corrupt (correct key:$_)"; 390 next; 391 } 392 393 if ( $arr[0] eq $match ) { 394 # return title and id 395 #$arr[1]=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og; 396 397 #$arr[0]=~s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge; 398 #$self->debug("exact:$arr[1] ($arr[2]) qualifier=$arr[3] id=$arr[4]"); 399 my $title=$arr[1]; 400 if ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\)$//o ) { 401 } 402 elsif ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\/[IVX]+\)$//o ) { 403 } 404 else { 405 die "unable to decode year from title key \"$title\", report to xmltv-devel\@lists.sf.net"; 406 } 407 $title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og; 408 $self->debug("exact:$title ($arr[2]) qualifier=$arr[3] id=$arr[4]"); 409 push(@{$results->{exactMatch}}, {'key'=> $arr[1], 410 'title'=>$title, 411 'year'=>$arr[2], 412 'qualifier'=>$arr[3], 413 'id'=>$arr[4]}); 414 } 415 else { 416 # decode 417 #s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge; 418 # return title 419 #$arr[1]=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og; 420 #$arr[0]=~s/%(?:([0-9a-fA-F]{2})|u([0-9a-fA-F]{4}))/defined($1)? chr hex($1) : utf8_chr(hex($2))/oge; 421 #$self->debug("close:$arr[1] ($arr[2]) qualifier=$arr[3] id=$arr[4]"); 422 my $title=$arr[1]; 423 424 if ( $title=~m/^\"/o && $title=~m/\"\s*\(/o ) { #" 425 $title=~s/^\"//o; #" 426 $title=~s/\"(\s*\()/$1/o; #" 427 } 428 429 if ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\)$//o ) { 430 } 431 elsif ( $title=~s/\s+\((\d\d\d\d|\?\?\?\?)\/[IVX]+\)$//o ) { 432 } 433 else { 434 die "unable to decode year from title key \"$title\", report to xmltv-devel\@lists.sf.net"; 435 } 436 $title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og; 437 $self->debug("close:$title ($arr[2]) qualifier=$arr[3] id=$arr[4]"); 438 push(@{$results->{closeMatch}}, {'key'=> $arr[1], 439 'title'=>$title, 440 'year'=>$arr[2], 441 'qualifier'=>$arr[3], 442 'id'=>$arr[4]}); 443 } 444 } 445 #print "MovieMatches on ($match) = ".Dumper($results)."\n"; 446 return($results); 447} 448 449sub getMovieExactMatch($$$) 450{ 451 my $self=shift; 452 my $title=shift; 453 my $year=shift; 454 my $res=$self->getMovieMatches($title, $year); 455 456 return(undef) if ( !defined($res) ); 457 if ( !defined($res->{exactMatch}) ) { 458 return(undef); 459 } 460 if ( scalar(@{$res->{exactMatch}}) != 1 ) { 461 return(undef); 462 } 463 return($res->{exactMatch}[0]); 464} 465 466sub getMovieCloseMatches($$) 467{ 468 my $self=shift; 469 my $title=shift; 470 471 my $res=$self->getMovieMatches($title, undef) || return(undef); 472 473 if ( defined($res->{exactMatch})) { 474 die "corrupt imdb database - hit on \"$title\""; 475 } 476 return(undef) if ( !defined($res->{closeMatch}) ); 477 my @arr=@{$res->{closeMatch}}; 478 #print "CLOSE DUMP=".Dumper(@arr)."\n"; 479 return(@arr); 480} 481 482sub getMovieIdDetails($$) 483{ 484 my $self=shift; 485 my $id=shift; 486 487 if ( !$self->{DBASE_FD} ) { 488 die "internal error: index not open"; 489 } 490 my $results; 491 my $FD=$self->{DBASE_FD}; 492 Search::Dict::look(*{$FD}, "$id:", 0, 0); 493 while (<$FD>) { 494 last if ( !m/^$id:/ ); 495 chop(); 496 if ( s/^$id:// ) { 497 my ($directors, $actors, $genres, $ratingDist, $ratingVotes, $ratingRank, $keywords, $plot)=split('\t', $_); 498 if ( $directors ne "<>" ) { 499 for my $name (split('\|', $directors)) { 500 # remove (I) etc from imdb.com names (kept in place for reference) 501 $name=~s/\s\([IVX]+\)$//o; 502 # switch name around to be surname last 503 $name=~s/^([^,]+),\s*(.*)$/$2 $1/o; 504 push(@{$results->{directors}}, $name); 505 } 506 } 507 if ( $actors ne "<>" ) { 508 for my $name (split('\|', $actors)) { 509 # remove (I) etc from imdb.com names (kept in place for reference) 510 my $HostNarrator; 511 if ( $name=~s/\[([^\]]+)\]$//o ) { 512 $HostNarrator=$1; 513 } 514 $name=~s/\s\([IVX]+\)$//o; 515 516 # switch name around to be surname last 517 $name=~s/^([^,]+),\s*(.*)$/$2 $1/o; 518 if ( $HostNarrator ) { 519 if ( $HostNarrator=~s/,*Host//o ) { 520 push(@{$results->{presenter}}, $name); 521 } 522 if ( $HostNarrator=~s/,*Narrator//o ) { 523 push(@{$results->{commentator}}, $name); 524 } 525 } 526 else { 527 push(@{$results->{actors}}, $name); 528 } 529 } 530 } 531 if ( $genres ne "<>" ) { 532 push(@{$results->{genres}}, split('\|', $genres)); 533 } 534 if ( $keywords ne "<>" ) { 535 push(@{$results->{keywords}}, split(',', $keywords)); 536 } 537 $results->{ratingDist}=$ratingDist if ( $ratingDist ne "<>" ); 538 $results->{ratingVotes}=$ratingVotes if ( $ratingVotes ne "<>" ); 539 $results->{ratingRank}=$ratingRank if ( $ratingRank ne "<>" ); 540 $results->{plot}=$plot if ( $plot ne "<>" ); 541 } 542 else { 543 warn "lookup of movie (id=$id) resulted in garbage ($_)"; 544 } 545 } 546 if ( !defined($results) ) { 547 # some movies we don't have any details for 548 $results->{noDetails}=1; 549 } 550 #print "MovieDetails($id) = ".Dumper($results)."\n"; 551 return($results); 552} 553 554# 555# FUTURE - close hit could be just missing or extra 556# punctuation: 557# "Run Silent, Run Deep" for imdb's "Run Silent Run Deep" 558# "Cherry, Harry and Raquel" for imdb's "Cherry, Harry and Raquel!" 559# "Cat Women of the Moon" for imdb's "Cat-Women of the Moon" 560# "Baywatch Hawaiian Wedding" for imdb's "Baywatch: Hawaiian Wedding" :) 561# 562# FIXED - "Victoria and Albert" appears for imdb's "Victoria & Albert" (and -> &) 563# FIXED - "Columbo Cries Wolf" appears instead of "Columbo:Columbo Cries Wolf" 564# FIXED - Place the article last, for multiple languages. For instance 565# Los amantes del c�rculo polar -> amantes del c�rculo polar, Los 566# FIXED - common international vowel changes. For instance 567# "Anna Kar�nin" (�->e) 568# 569sub alternativeTitles($) 570{ 571 my $title=shift; 572 my @titles; 573 574 push(@titles, $title); 575 576 # try the & -> and conversion 577 if ( $title=~m/\&/o ) { 578 my $t=$title; 579 while ( $t=~s/(\s)\&(\s)/$1and$2/o ) { 580 push(@titles, $t); 581 } 582 } 583 584 # try the and -> & conversion 585 if ( $title=~m/\sand\s/io ) { 586 my $t=$title; 587 while ( $t=~s/(\s)and(\s)/$1\&$2/io ) { 588 push(@titles, $t); 589 } 590 } 591 592 # try the "Columbo: Columbo cries Wolf" -> "Columbo cries Wolf" conversion 593 my $max=scalar(@titles); 594 for (my $i=0; $i<$max ; $i++) { 595 my $t=$titles[$i]; 596 if ( $t=~m/^[^:]+:.+$/io ) { 597 while ( $t=~s/^[^:]+:\s*(.+)\s*$/$1/io ) { 598 push(@titles, $t); 599 } 600 } 601 } 602 603 # Place the articles last 604 $max=scalar(@titles); 605 for (my $i=0; $i<$max ; $i++) { 606 my $t=$titles[$i]; 607 if ( $t=~m/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/io ) { 608 $t=~s/^(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)\s+(.*)$/$2, $1/iog; 609 push(@titles, $t); 610 } 611 if ( $t=~m/^(.+),\s*(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)$/io ) { 612 $t=~s/^(.+),\s*(The|A|Une|Les|Los|Las|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/iog; 613 push(@titles, $t); 614 } 615 } 616 617 # convert all the special language characters 618 $max=scalar(@titles); 619 for (my $i=0; $i<$max ; $i++) { 620 my $t=$titles[$i]; 621 if ( $t=~m/[����������������������������������������������������������]/io ) { 622 $t=~s/[������������]/a/gio; 623 $t=~s/[��������]/e/gio; 624 $t=~s/[��������]/i/gio; 625 $t=~s/[������������]/o/gio; 626 $t=~s/[��������]/u/gio; 627 $t=~s/[��]/ae/gio; 628 $t=~s/[��]/c/gio; 629 $t=~s/[��]/n/gio; 630 $t=~s/[�]/ss/gio; 631 $t=~s/[���]/y/gio; 632 $t=~s/[�]//gio; 633 push(@titles, $t); 634 } 635 } 636 637 # optional later possible titles include removing the '.' from titles 638 # ie "Project V.I.P.E.R." matching imdb "Project VIPER" 639 $max=scalar(@titles); 640 for (my $i=0; $i<$max ; $i++) { 641 my $t=$titles[$i]; 642 if ( $t=~s/\.//go ) { 643 push(@titles,$t); 644 } 645 } 646 return(\@titles); 647} 648 649sub findMovieInfo($$$$) 650{ 651 my ($self, $title, $year, $exact)=@_; 652 653 my @titles=@{alternativeTitles($title)}; 654 655 if ( $exact == 1 ) { 656 # try an exact match first :) 657 for my $mytitle ( @titles ) { 658 my $info=$self->getMovieExactMatch($mytitle, $year); 659 if ( defined($info) ) { 660 if ( $info->{qualifier} eq "movie" ) { 661 $self->status("perfect hit on movie \"$info->{key}\""); 662 $info->{matchLevel}="perfect"; 663 return($info); 664 } 665 elsif ( $info->{qualifier} eq "tv_movie" ) { 666 $self->status("perfect hit on made-for-tv-movie \"$info->{key}\""); 667 $info->{matchLevel}="perfect"; 668 return($info); 669 } 670 elsif ( $info->{qualifier} eq "video_movie" ) { 671 $self->status("perfect hit on made-for-video-movie \"$info->{key}\""); 672 $info->{matchLevel}="perfect"; 673 return($info); 674 } 675 elsif ( $info->{qualifier} eq "video_game" ) { 676 next; 677 } 678 elsif ( $info->{qualifier} eq "tv_series" ) { 679 } 680 elsif ( $info->{qualifier} eq "tv_mini_series" ) { 681 } 682 else { 683 $self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\""); 684 $self->error("weird trailing qualifier \"$info->{qualifier}\""); 685 $self->error("submit bug report to xmltv-devel\@lists.sf.net"); 686 } 687 } 688 $self->debug("no exact title/year hit on \"$mytitle ($year)\""); 689 } 690 return(undef); 691 } 692 elsif ( $exact == 2 ) { 693 # looking for first exact match on the title, don't have a year to compare 694 695 for my $mytitle ( @titles ) { 696 # try close hit if only one :) 697 my $cnt=0; 698 my @closeMatches=$self->getMovieCloseMatches("$mytitle"); 699 700 # we traverse the hits twice, first looking for success, 701 # then again to produce warnings about missed close matches 702 for my $info (@closeMatches) { 703 next if ( !defined($info) ); 704 $cnt++; 705 706 # within one year with exact match good enough 707 if ( lc($mytitle) eq lc($info->{title}) ) { 708 709 if ( $info->{qualifier} eq "movie" ) { 710 $self->status("close enough hit on movie \"$info->{key}\" (since no 'date' field present)"); 711 $info->{matchLevel}="close"; 712 return($info); 713 } 714 elsif ( $info->{qualifier} eq "tv_movie" ) { 715 $self->status("close enough hit on made-for-tv-movie \"$info->{key}\" (since no 'date' field present)"); 716 $info->{matchLevel}="close"; 717 return($info); 718 } 719 elsif ( $info->{qualifier} eq "video_movie" ) { 720 $self->status("close enough hit on made-for-video-movie \"$info->{key}\" (since no 'date' field present)"); 721 $info->{matchLevel}="close"; 722 return($info); 723 } 724 elsif ( $info->{qualifier} eq "video_game" ) { 725 next; 726 } 727 elsif ( $info->{qualifier} eq "tv_series" ) { 728 } 729 elsif ( $info->{qualifier} eq "tv_mini_series" ) { 730 } 731 else { 732 $self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\""); 733 $self->error("weird trailing qualifier \"$info->{qualifier}\""); 734 $self->error("submit bug report to xmltv-devel\@lists.sf.net"); 735 } 736 } 737 } 738 } 739 # nothing worked 740 return(undef); 741 } 742 743 # otherwise we're looking for a title match with a close year 744 for my $mytitle ( @titles ) { 745 # try close hit if only one :) 746 my $cnt=0; 747 my @closeMatches=$self->getMovieCloseMatches("$mytitle"); 748 749 # we traverse the hits twice, first looking for success, 750 # then again to produce warnings about missed close matches 751 for my $info (@closeMatches) { 752 next if ( !defined($info) ); 753 $cnt++; 754 755 # within one year with exact match good enough 756 if ( lc($mytitle) eq lc($info->{title}) ) { 757 my $yearsOff=abs(int($info->{year})-$year); 758 759 $info->{matchLevel}="close"; 760 761 if ( $yearsOff <= 2 ) { 762 my $showYear=int($info->{year}); 763 764 if ( $info->{qualifier} eq "movie" ) { 765 $self->status("close enough hit on movie \"$info->{key}\" (off by $yearsOff years)"); 766 return($info); 767 } 768 elsif ( $info->{qualifier} eq "tv_movie" ) { 769 $self->status("close enough hit on made-for-tv-movie \"$info->{key}\" (off by $yearsOff years)"); 770 return($info); 771 } 772 elsif ( $info->{qualifier} eq "video_movie" ) { 773 $self->status("close enough hit on made-for-video-movie \"$info->{key}\" (off by $yearsOff years)"); 774 return($info); 775 } 776 elsif ( $info->{qualifier} eq "video_game" ) { 777 $self->status("ignoring close hit on video-game \"$info->{key}\""); 778 next; 779 } 780 elsif ( $info->{qualifier} eq "tv_series" ) { 781 $self->status("ignoring close hit on tv series \"$info->{key}\""); 782 #$self->status("close enough hit on tv series \"$info->{key}\" (off by $yearsOff years)"); 783 } 784 elsif ( $info->{qualifier} eq "tv_mini_series" ) { 785 $self->status("ignoring close hit on tv mini-series \"$info->{key}\""); 786 #$self->status("close enough hit on tv mini-series \"$info->{key}\" (off by $yearsOff years)"); 787 } 788 else { 789 $self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\""); 790 $self->error("weird trailing qualifier \"$info->{qualifier}\""); 791 $self->error("submit bug report to xmltv-devel\@lists.sf.net"); 792 } 793 } 794 } 795 } 796 797 # if we found at least something, but nothing matched 798 # produce warnings about missed, but close matches 799 for my $info (@closeMatches) { 800 next if ( !defined($info) ); 801 802 # within one year with exact match good enough 803 if ( lc($mytitle) eq lc($info->{title}) ) { 804 my $yearsOff=abs(int($info->{year})-$year); 805 if ( $yearsOff <= 2 ) { 806 #die "internal error: key \"$info->{key}\" failed to be processed properly"; 807 } 808 elsif ( $yearsOff <= 5 ) { 809 # report these as status 810 $self->status("ignoring close, but not good enough hit on \"$info->{key}\" (off by $yearsOff years)"); 811 } 812 else { 813 # report these as debug messages 814 $self->debug("ignoring close hit on \"$info->{key}\" (off by $yearsOff years)"); 815 } 816 } 817 else { 818 $self->debug("ignoring close hit on \"$info->{key}\" (title did not match)"); 819 } 820 } 821 } 822 #$self->status("failed to lookup \"$title ($year)\""); 823 return(undef); 824} 825 826sub findTVSeriesInfo($$) 827{ 828 my ($self, $title)=@_; 829 830 if ( $self->{cacheLookups} ) { 831 my $id=$self->{cachedLookups}->{tv_series}->{$title}; 832 833 if ( defined($id) ) { 834 #print STDERR "REF= (".ref($id).")\n"; 835 if ( $id ne '' ) { 836 return($id); 837 } 838 return(undef); 839 } 840 } 841 842 my @titles=@{alternativeTitles($title)}; 843 844 # try an exact match first :) 845 my $idInfo; 846 847 for my $mytitle ( @titles ) { 848 # try close hit if only one :) 849 my $cnt=0; 850 my @closeMatches=$self->getMovieCloseMatches("$mytitle"); 851 852 for my $info (@closeMatches) { 853 next if ( !defined($info) ); 854 $cnt++; 855 856 if ( lc($mytitle) eq lc($info->{title}) ) { 857 858 $info->{matchLevel}="perfect"; 859 860 if ( $info->{qualifier} eq "movie" ) { 861 #$self->status("ignoring close hit on movie \"$info->{key}\""); 862 } 863 elsif ( $info->{qualifier} eq "tv_movie" ) { 864 #$self->status("ignoring close hit on tv movie \"$info->{key}\""); 865 } 866 elsif ( $info->{qualifier} eq "video_movie" ) { 867 #$self->status("ignoring close hit on made-for-video-movie \"$info->{key}\""); 868 } 869 elsif ( $info->{qualifier} eq "video_game" ) { 870 #$self->status("ignoring close hit on made-for-video-movie \"$info->{key}\""); 871 next; 872 } 873 elsif ( $info->{qualifier} eq "tv_series" ) { 874 $idInfo=$info; 875 $self->status("perfect hit on tv series \"$info->{key}\""); 876 last; 877 } 878 elsif ( $info->{qualifier} eq "tv_mini_series" ) { 879 $idInfo=$info; 880 $self->status("perfect hit on tv mini-series \"$info->{key}\""); 881 last; 882 } 883 else { 884 $self->error("$self->{moviedbIndex} responded with wierd entry for \"$info->{key}\""); 885 $self->error("weird trailing qualifier \"$info->{qualifier}\""); 886 $self->error("submit bug report to xmltv-devel\@lists.sf.net"); 887 } 888 } 889 } 890 last if ( defined($idInfo) ); 891 } 892 893 if ( $self->{cacheLookups} ) { 894 # flush cache after this lookup if its gotten too big 895 if ( $self->{cachedLookups}->{tv_series}->{_cacheSize_} > 896 $self->{cacheLookupSize} ) { 897 delete($self->{cachedLookups}->{tv_series}); 898 $self->{cachedLookups}->{tv_series}->{_cacheSize_}=0; 899 } 900 if ( defined($idInfo) ) { 901 $self->{cachedLookups}->{tv_series}->{$title}=$idInfo; 902 } 903 else { 904 $self->{cachedLookups}->{tv_series}->{$title}=""; 905 } 906 $self->{cachedLookups}->{tv_series}->{_cacheSize_}++; 907 } 908 if ( defined($idInfo) ) { 909 return($idInfo); 910 } 911 else { 912 #$self->status("failed to lookup tv series \"$title\""); 913 return(undef); 914 } 915} 916 917# 918# todo - add country of origin 919# todo - video (colour/aspect etc) details 920# todo - audio (stereo) details 921# todo - ratings ? - use certificates.list 922# todo - add description - plot summaries ? - which one do we choose ? 923# todo - writer 924# todo - producer 925# todo - running time (duration) 926# todo - identify 'Host' and 'Narrator's and put them in as 927# credits:presenter and credits:commentator resp. 928# todo - check program length - probably a warning if longer ? 929# can we update length (separate from runnning time in the output ?) 930# todo - icon - url from www.imdb.com of programme image ? 931# this could be done by scraping for the hyper linked poster 932# <a name="poster"><img src="http://ia.imdb.com/media/imdb/01/I/60/69/80m.jpg" height="139" width="99" border="0"></a> 933# and grabbin' out the img entry. (BTW ..../npa.jpg seems to line up with no poster available) 934# 935# 936sub applyFound($$$) 937{ 938 my ($self, $prog, $idInfo)=@_; 939 940 my $title=$prog->{title}->[0]->[0]; 941 942 if ( $self->{updateDates} ) { 943 my $date; 944 945 # don't add dates only fix them for tv_series 946 if ( $idInfo->{qualifier} eq "movie" || 947 $idInfo->{qualifier} eq "video_movie" || 948 $idInfo->{qualifier} eq "tv_movie" ) { 949 #$self->debug("adding 'date' field (\"$idInfo->{year}\") on \"$title\""); 950 $date=int($idInfo->{year}); 951 } 952 else { 953 #$self->debug("not adding 'date' field to $idInfo->{qualifier} \"$title\""); 954 $date=undef; 955 } 956 957 if ( $self->{replaceDates} ) { 958 if ( defined($prog->{date}) && defined($date) ) { 959 $self->debug("replacing 'date' field"); 960 delete($prog->{date}); 961 $prog->{date}=$date; 962 } 963 } 964 else { 965 # only set date if not already defined 966 if ( !defined($prog->{date}) && defined($date) ) { 967 $prog->{date}=$date; 968 } 969 } 970 } 971 972 if ( $self->{updateTitles} ) { 973 if ( $idInfo->{title} ne $title ) { 974 if ( $self->{replaceTitles} ) { 975 $self->debug("replacing (all) 'title' from \"$title\" to \"$idInfo->{title}\""); 976 delete($prog->{title}); 977 } 978 979 my @list; 980 981 push(@list, [$idInfo->{title}, undef]); 982 983 if ( defined($prog->{title}) ) { 984 my $name=$idInfo->{title}; 985 my $found=0; 986 for my $v (@{$prog->{title}}) { 987 if ( lc($v->[0]) eq lc($name) ) { 988 $found=1; 989 } 990 else { 991 push(@list, $v); 992 } 993 } 994 } 995 $prog->{title}=\@list; 996 } 997 } 998 999 if ( $self->{updateURLs} ) { 1000 if ( $self->{replaceURLs} ) { 1001 if ( defined($prog->{url}) ) { 1002 $self->debug("replacing (all) 'url'"); 1003 delete($prog->{url}); 1004 } 1005 } 1006 1007 # add url to programme on www.imdb.com 1008 my $url=$idInfo->{key}; 1009 1010 $url=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg; 1011 $url="http://us.imdb.com/M/title-exact?".$url; 1012 1013 if ( defined($prog->{url}) ) { 1014 my @rep; 1015 push(@rep, $url); 1016 for (@{$prog->{url}}) { 1017 # skip urls for imdb.com that we're probably safe to replace 1018 if ( !m;^http://us.imdb.com/M/title-exact;o ) { 1019 push(@rep, $_); 1020 } 1021 } 1022 $prog->{url}=\@rep; 1023 } 1024 else { 1025 push(@{$prog->{url}}, $url); 1026 } 1027 } 1028 1029 # squirrel away movie qualifier so its first on the list of replacements 1030 my @categories; 1031 push(@categories, [$self->{categories}->{$idInfo->{qualifier}}, 'en']); 1032 if ( !defined($self->{categories}->{$idInfo->{qualifier}}) ) { 1033 die "how did we get here with an invalid qualifier '$idInfo->{qualifier}'"; 1034 } 1035 1036 my $details=$self->getMovieIdDetails($idInfo->{id}); 1037 if ( $details->{noDetails} ) { 1038 # we don't have any details on this movie 1039 } 1040 else { 1041 # add directors list 1042 if ( $self->{updateDirectors} && defined($details->{directors}) ) { 1043 # only update directors if we have exactly one or if 1044 # its a movie of some kind, add more than one. 1045 if ( scalar(@{$details->{directors}}) == 1 || 1046 $idInfo->{qualifier} eq "movie" || 1047 $idInfo->{qualifier} eq "video_movie" || 1048 $idInfo->{qualifier} eq "tv_movie" ) { 1049 1050 if ( $self->{replaceDirectors} ) { 1051 if ( defined($prog->{credits}->{director}) ) { 1052 $self->debug("replacing director(s)"); 1053 delete($prog->{credits}->{director}); 1054 } 1055 } 1056 1057 my @list; 1058 # add top 3 billing directors list form www.imdb.com 1059 for my $name (splice(@{$details->{directors}},0,3)) { 1060 push(@list, $name); 1061 } 1062 1063 # preserve all existing directors listed if we did't already have them. 1064 if ( defined($prog->{credits}->{director}) ) { 1065 for my $name (@{$prog->{credits}->{director}}) { 1066 my $found=0; 1067 for(@list) { 1068 if ( lc eq lc($name) ) { 1069 $found=1; 1070 } 1071 } 1072 if ( !$found ) { 1073 push(@list, $name); 1074 } 1075 } 1076 } 1077 $prog->{credits}->{director}=\@list; 1078 } 1079 else { 1080 $self->debug("not adding 'director' field to $idInfo->{qualifier} \"$title\""); 1081 } 1082 } 1083 1084 if ( $self->{updateActors} && defined($details->{actors}) ) { 1085 if ( $self->{replaceActors} ) { 1086 if ( defined($prog->{credits}->{actor}) ) { 1087 $self->debug("replacing actor(s) on $idInfo->{qualifier} \"$idInfo->{key}\""); 1088 delete($prog->{credits}->{actor}); 1089 } 1090 } 1091 1092 my @list; 1093 # add top billing actors (default = 3) from www.imdb.com 1094 for my $name (splice(@{$details->{actors}},0,$self->{numActors})) { 1095 push(@list, $name); 1096 } 1097 # preserve all existing actors listed if we did't already have them. 1098 if ( defined($prog->{credits}->{actor}) ) { 1099 for my $name (@{$prog->{credits}->{actor}}) { 1100 my $found=0; 1101 for(@list) { 1102 if ( lc eq lc($name) ) { 1103 $found=1; 1104 } 1105 } 1106 if ( !$found ) { 1107 push(@list, $name); 1108 } 1109 } 1110 } 1111 $prog->{credits}->{actor}=\@list; 1112 } 1113 1114 if ( $self->{updatePresentors} && defined($details->{presenter}) ) { 1115 if ( $self->{replacePresentors} ) { 1116 if ( defined($prog->{credits}->{presenter}) ) { 1117 $self->debug("replacing presentor"); 1118 delete($prog->{credits}->{presenter}); 1119 } 1120 } 1121 $prog->{credits}->{presenter}=$details->{presenter}; 1122 } 1123 if ( $self->{updateCommentators} && defined($details->{commentator}) ) { 1124 if ( $self->{replaceCommentators} ) { 1125 if ( defined($prog->{credits}->{commentator}) ) { 1126 $self->debug("replacing commentator"); 1127 delete($prog->{credits}->{commentator}); 1128 } 1129 } 1130 $prog->{credits}->{commentator}=$details->{commentator}; 1131 } 1132 1133 # push genres as categories 1134 if ( $self->{updateCategoriesWithGenres} ) { 1135 if ( defined($details->{genres}) ) { 1136 for (@{$details->{genres}}) { 1137 push(@categories, [$_, 'en']); 1138 } 1139 } 1140 } 1141 1142 if ( $self->{updateStarRatings} && defined($details->{ratingRank}) ) { 1143 if ( $self->{replaceStarRatings} ) { 1144 if ( defined($prog->{'star-rating'}) ) { 1145 $self->debug("replacing 'star-rating'"); 1146 delete($prog->{'star-rating'}); 1147 } 1148 unshift( @{$prog->{'star-rating'}}, [ $details->{ratingRank} . "/10", 'IMDB User Rating' ] ); 1149 } 1150 else { 1151 # add IMDB User Rating in front of all other star-ratings 1152 unshift( @{$prog->{'star-rating'}}, [ $details->{ratingRank} . "/10", 'IMDB User Rating' ] ); 1153 } 1154 } 1155 1156 if ( $self->{updateKeywords} ) { 1157 my @keywords; 1158 if ( defined($details->{keywords}) ) { 1159 for (@{$details->{keywords}}) { 1160 push(@keywords, [$_, 'en']); 1161 } 1162 } 1163 1164 if ( $self->{replaceKeywords} ) { 1165 if ( defined($prog->{keywords}) ) { 1166 $self->debug("replacing (all) 'keywords'"); 1167 delete($prog->{keywords}); 1168 } 1169 } 1170 if ( defined($prog->{keyword}) ) { 1171 for my $value (@{$prog->{keyword}}) { 1172 my $found=0; 1173 for my $k (@keywords) { 1174 if ( lc($k->[0]) eq lc($value->[0]) ) { 1175 $found=1; 1176 } 1177 } 1178 if ( !$found ) { 1179 push(@keywords, $value); 1180 } 1181 } 1182 } 1183 $prog->{keyword}=\@keywords; 1184 } 1185 1186 if ( $self->{updatePlot} ) { 1187 # plot is held as a <desc> entity 1188 # if 'replacePlot' then delete all existing <desc> entities and add new 1189 # else add this plot as an additional <desc> entity 1190 # 1191 if ( $self->{replacePlot} ) { 1192 if ( defined($prog->{desc}) ) { 1193 $self->debug("replacing (all) 'desc'"); 1194 delete($prog->{desc}); 1195 } 1196 } 1197 if ( defined($details->{plot}) ) { 1198 # check it's not already there 1199 my $found = 0; 1200 for my $_desc ( @{$prog->{desc}} ) { 1201 $found = 1 if ( @{$_desc}[0] eq $details->{plot} ); 1202 } 1203 push @{$prog->{desc}}, [ $details->{plot}, 'en' ] if !$found; 1204 } 1205 } 1206 1207 } 1208 1209 if ( $self->{updateCategories} ) { 1210 if ( $self->{replaceCategories} ) { 1211 if ( defined($prog->{category}) ) { 1212 $self->debug("replacing (all) 'category'"); 1213 delete($prog->{category}); 1214 } 1215 } 1216 if ( defined($prog->{category}) ) { 1217 for my $value (@{$prog->{category}}) { 1218 my $found=0; 1219 #print "checking category $value->[0] with $mycategory\n"; 1220 for my $c (@categories) { 1221 if ( lc($c->[0]) eq lc($value->[0]) ) { 1222 $found=1; 1223 } 1224 } 1225 if ( !$found ) { 1226 push(@categories, $value); 1227 } 1228 } 1229 } 1230 $prog->{category}=\@categories; 1231 } 1232 1233 return($prog); 1234} 1235 1236sub augmentProgram($$$) 1237{ 1238 my ($self, $prog, $movies_only)=@_; 1239 1240 $self->{stats}->{programCount}++; 1241 1242 # assume first title in first language is the one we want. 1243 my $title=$prog->{title}->[0]->[0]; 1244 1245 if ( defined($prog->{date}) && $prog->{date}=~m/^\d\d\d\d$/o ) { 1246 1247 # for programs with dates we try: 1248 # - exact matches on movies 1249 # - exact matches on tv series 1250 # - close matches on movies 1251 my $id=$self->findMovieInfo($title, $prog->{date}, 1); # exact match 1252 if ( !defined($id) ) { 1253 $id=$self->findTVSeriesInfo($title); 1254 if ( !defined($id) ) { 1255 $id=$self->findMovieInfo($title, $prog->{date}, 0); # close match 1256 } 1257 } 1258 if ( defined($id) ) { 1259 $self->{stats}->{$id->{matchLevel}."Matches"}++; 1260 $self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++; 1261 return($self->applyFound($prog, $id)); 1262 } 1263 $self->status("failed to find a match for movie \"$title ($prog->{date})\""); 1264 return(undef); 1265 # fall through and try again as a tv series 1266 } 1267 1268 if ( !$movies_only ) { 1269 my $id=$self->findTVSeriesInfo($title); 1270 if ( defined($id) ) { 1271 $self->{stats}->{$id->{matchLevel}."Matches"}++; 1272 $self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++; 1273 return($self->applyFound($prog, $id)); 1274 } 1275 1276 if ( 0 ) { 1277 # this has hard to support 'close' results, unless we know 1278 # for certain we're looking for a movie (ie duration etc) 1279 # this is a bad idea. 1280 my $id=$self->findMovieInfo($title, undef, 2); # any title match 1281 if ( defined($id) ) { 1282 $self->{stats}->{$id->{matchLevel}."Matches"}++; 1283 $self->{stats}->{$id->{matchLevel}}->{$id->{qualifier}}++; 1284 return($self->applyFound($prog, $id)); 1285 } 1286 } 1287 $self->status("failed to find a match for show \"$title\""); 1288 } 1289 return(undef); 1290} 1291 1292# 1293# todo - add in stats on other things added (urls ?, actors, directors,categories) 1294# separate out from what was added or updated 1295# 1296sub getStatsLines($) 1297{ 1298 my $self=shift; 1299 my $totalChannelsParsed=shift; 1300 1301 my $endTime=time(); 1302 my %stats=%{$self->{stats}}; 1303 1304 my $ret=sprintf("Checked %d programs, on %d channels\n", $stats{programCount}, $totalChannelsParsed); 1305 1306 for my $cat (sort keys %{$self->{categories}}) { 1307 $ret.=sprintf(" found %d %s titles", $stats{perfect}->{$cat}+$stats{close}->{$cat}, 1308 $self->{categories}->{$cat}); 1309 if ( $stats{close}->{$cat} != 0 ) { 1310 if ( $stats{close}->{$cat} == 1 ) { 1311 $ret.=sprintf(" (%d was not perfect)", $stats{close}->{$cat}); 1312 } 1313 else { 1314 $ret.=sprintf(" (%d were not perfect)", $stats{close}->{$cat}); 1315 } 1316 } 1317 $ret.="\n"; 1318 } 1319 1320 $ret.=sprintf(" augmented %.2f%% of the programs, parsing %.2f programs/sec\n", 1321 ($stats{programCount}!=0)?(($stats{perfectMatches}+$stats{closeMatches})*100)/$stats{programCount}:0, 1322 ($endTime!=$stats{startTime} && $stats{programCount} != 0)? 1323 $stats{programCount}/($endTime-$stats{startTime}):0); 1324 1325 return($ret); 1326} 1327 13281; 1329 1330# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1331# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1332 1333package XMLTV::IMDB::Crunch; 1334use LWP; 1335 1336use open ':encoding(iso-8859-1)'; # try to enforce file encoding (does this work in Perl <5.8.1? ) 1337 1338# Use Term::ProgressBar if installed. 1339use constant Have_bar => eval { 1340 require Term::ProgressBar; 1341 $Term::ProgressBar::VERSION >= 2; 1342}; 1343 1344# 1345# This package parses and manages to index imdb plain text files from 1346# ftp.imdb.com/interfaces. (see http://www.imdb.com/interfaces for 1347# details) 1348# 1349# I might, given time build a download manager that: 1350# - downloads the latest plain text files 1351# - understands how to download each week's diffs and apply them 1352# Currently, the 'downloadMissingFiles' flag in the hash of attributes 1353# passed triggers a simple-minded downloader. 1354# 1355# I may also roll this project into a xmltv-free imdb-specific 1356# perl interface that just supports callbacks and understands more of 1357# the imdb file formats. 1358# 1359 1360sub new 1361{ 1362 my ($type) = shift; 1363 my $self={ @_ }; # remaining args become attributes 1364 for ($self->{downloadMissingFiles}) { 1365 $_=0 if not defined; # default 1366 } 1367 1368 for ('imdbDir', 'verbose') { 1369 die "invalid usage - no $_" if ( !defined($self->{$_})); 1370 } 1371 1372 $self->{stageLast} = 9; # set the final stage in the build - i.e. the one which builds the final database 1373 $self->{stages} = { 1=>'movies', 2=>'directors', 3=>'actors', 4=>'actresses', 5=>'genres', 6=>'ratings', 7=>'keywords', 8=>'plot' }; 1374 $self->{optionalStages} = { 'keywords' => 7, 'plot' => 8 }; # list of optional stages - no need to download files for these 1375 1376 $self->{moviedbIndex}="$self->{imdbDir}/moviedb.idx"; 1377 $self->{moviedbData}="$self->{imdbDir}/moviedb.dat"; 1378 $self->{moviedbInfo}="$self->{imdbDir}/moviedb.info"; 1379 $self->{moviedbOffline}="$self->{imdbDir}/moviedb.offline"; 1380 1381 # only leave progress bar on if its available 1382 if ( !Have_bar ) { 1383 $self->{showProgressBar}=0; 1384 } 1385 1386 bless($self, $type); 1387 1388 if ( $self->{stageToRun} ne $self->{stageLast} ) { 1389 # unless this is the last stage, check we have the necessary files 1390 return(undef) if ( $self->checkFiles() != 0 ); 1391 } 1392 1393 return($self); 1394} 1395 1396 1397sub checkFiles () { 1398 1399 my ($self)=@_; 1400 1401 if ( ! -d "$self->{imdbDir}" ) { 1402 if ( $self->{downloadMissingFiles} ) { 1403 warn "creating directory $self->{imdbDir}\n"; 1404 mkdir $self->{imdbDir}, 0777 1405 or die "cannot mkdir $self->{imdbDir}: $!"; 1406 } 1407 else { 1408 die "$self->{imdbDir}:does not exist"; 1409 } 1410 } 1411 my $listsDir = "$self->{imdbDir}/lists"; 1412 if ( ! -d $listsDir ) { 1413 mkdir $listsDir, 0777 or die "cannot mkdir $listsDir: $!"; 1414 } 1415 1416 CHECK_FILES: 1417 my %missingListFiles; # maps 'movies' to filename ...movies.gz 1418 1419 FILES_CHECK: 1420 while ( my( $key, $value ) = each %{ $self->{stages} } ) { 1421 # don't check *all* files - only the ones we are crunching 1422 next FILES_CHECK if ( lc($self->{stageToRun}) ne 'all' && $key != int($self->{stageToRun}) ); 1423 my $file=$value; 1424 my $filename="$listsDir/$file.list"; 1425 my $filenameGz="$filename.gz"; 1426 my $filenameExists = -f $filename; 1427 my $filenameSize = -s $filename; 1428 my $filenameGzExists = -f $filenameGz; 1429 my $filenameGzSize = -s $filenameGz; 1430 1431 if ( $filenameExists and not $filenameSize ) { 1432 warn "removing zero-length $filename\n"; 1433 unlink $filename or die "cannot unlink $filename: $!"; 1434 $filenameExists = 0; 1435 } 1436 if ( $filenameGzExists and not $filenameGzSize ) { 1437 warn "removing zero-length $filenameGz\n"; 1438 unlink $filenameGz or die "cannot unlink $filenameGz: $!"; 1439 $filenameGzExists = 0; 1440 } 1441 1442 if ( not $filenameExists and not $filenameGzExists ) { 1443 # Just report one of the filenames, keep the message simple. 1444 warn "$filenameGz does not exist\n"; 1445 if ( $self->{optionalStages}{$file} ) { 1446 warn "$file will not be added to database\n"; 1447 } else { 1448 $missingListFiles{$file}=$filenameGz; 1449 } 1450 } 1451 elsif ( not $filenameExists and $filenameGzExists ) { 1452 $self->{imdbListFiles}->{$file}=$filenameGz; 1453 } 1454 elsif ( $filenameExists and not $filenameGzExists ) { 1455 $self->{imdbListFiles}->{$file}=$filename; 1456 } 1457 elsif ( $filenameExists and $filenameGzExists ) { 1458 die "both $filename and $filenameGz exist, remove one of them\n"; 1459 } 1460 else { die } 1461 } 1462 if ( $self->{downloadMissingFiles} ) { 1463 my $baseUrl = 'ftp://ftp.fu-berlin.de/pub/misc/movies/database'; 1464 foreach ( sort keys %missingListFiles ) { 1465 my $url = "$baseUrl/$_.list.gz"; 1466 my $filename = delete $missingListFiles{$_}; 1467 my $partial = "$filename.partial"; 1468 if (-e $partial) { 1469 if (not -s $partial) { 1470 print STDERR "removing empty $partial\n"; 1471 unlink $partial or die "cannot unlink $partial: $!"; 1472 } 1473 else { 1474 die <<END 1475$partial already exists, remove it or try renaming to $filename and 1476resuming the download of <$url> by hand. 1477 1478END 1479 ; 1480 } 1481 } 1482 1483 print STDERR <<END 1484Trying to download <$url>. 1485With a slow network link this could fail; it might be better to 1486download the file by hand and save it as 1487$filename. 1488 1489END 1490 ; 1491 # For downloading we use LWP 1492 # 1493 my $ua = LWP::UserAgent->new(); 1494 $ua->env_proxy(); 1495 $ua->show_progress(1); 1496 1497 my $req = HTTP::Request->new(GET => $url); 1498 $req->authorization_basic('anonymous', 'tv_imdb'); 1499 1500 my $resp = $ua->request($req, $filename); 1501 my $got_size = -s $filename; 1502 if (defined $resp and $resp->is_success ) { 1503 die if not $got_size; 1504 print STDERR "<$url>\n\t-> $filename, success\n\n"; 1505 } 1506 else { 1507 my $msg = "failed to download $url to $filename"; 1508 $msg .= ", http response code: ".$resp->status_line if defined $resp; 1509 warn $msg; 1510 if ($got_size) { 1511 warn "renaming $filename -> $partial\n"; 1512 rename $filename, $partial 1513 or die "cannot rename $filename to $partial: $!"; 1514 warn "You might try continuing the download of <$url> manually.\n"; 1515 } 1516 exit(1); 1517 } 1518 } 1519 $self->{downloadMissingFiles} = 0; 1520 goto CHECK_FILES; 1521 } 1522 1523 if ( %missingListFiles ) { 1524 print STDERR "tv_imdb: requires you to download the above files from ftp.imdb.com\n"; 1525 print STDERR " see http://www.imdb.com/interfaces for details\n"; 1526 print STDERR " or try the --download option\n"; 1527 #return(undef); 1528 return 1; 1529 } 1530 1531 return 0; 1532} 1533 1534sub redirect($$) 1535{ 1536 my ($self, $file)=@_; 1537 1538 if ( defined($file) ) { 1539 if ( !open($self->{logfd}, "> $file") ) { 1540 print STDERR "$file:$!\n"; 1541 return(0); 1542 } 1543 $self->{errorCountInLog}=0; 1544 } 1545 else { 1546 close($self->{logfd}); 1547 $self->{logfd}=undef; 1548 } 1549 return(1); 1550} 1551 1552sub error($$) 1553{ 1554 my $self=shift; 1555 if ( defined($self->{logfd}) ) { 1556 print {$self->{logfd}} $_[0]."\n"; 1557 $self->{errorCountInLog}++; 1558 } 1559 else { 1560 print STDERR $_[0]."\n"; 1561 } 1562} 1563 1564sub status($$) 1565{ 1566 my $self=shift; 1567 1568 if ( $self->{verbose} ) { 1569 print STDERR $_[0]."\n"; 1570 } 1571} 1572 1573sub withThousands ($) 1574{ 1575 my ($val) = @_; 1576 $val =~ s/(\d{1,3}?)(?=(\d{3})+$)/$1,/g; 1577 return $val; 1578} 1579 1580use XMLTV::Gunzip; 1581use IO::File; 1582 1583sub openMaybeGunzip($) 1584{ 1585 for ( shift ) { 1586 return gunzip_open($_) if m/\.gz$/; 1587 return new IO::File("< $_"); 1588 } 1589} 1590 1591sub closeMaybeGunzip($$) 1592{ 1593 if ( $_[0]=~m/\.gz$/o ) { 1594 # Would close($fh) but that causes segfaults on my system. 1595 # Investigating, but in the meantime just leave it open. 1596 # 1597 #return gunzip_close($_[1]); 1598 } 1599 1600 # Apparently this can also segfault (wtf?). 1601 #return close($_[1]); 1602} 1603 1604sub readMoviesOrGenres($$$$) 1605{ 1606 my ($self, $whichMoviesOrGenres, $countEstimate, $file)=@_; 1607 my $startTime=time(); 1608 my $header; 1609 my $whatAreWeParsing; 1610 my $lineCount=0; 1611 1612 if ( $whichMoviesOrGenres eq "Movies" ) { 1613 $header="MOVIES LIST"; 1614 $whatAreWeParsing=1; 1615 } 1616 elsif ( $whichMoviesOrGenres eq "Genres" ) { 1617 $header="8: THE GENRES LIST"; 1618 $whatAreWeParsing=2; 1619 } 1620 my $fh = openMaybeGunzip($file) || return(-2); 1621 while(<$fh>) { 1622 $lineCount++; 1623 if ( m/^$header/ ) { 1624 if ( !($_=<$fh>) || !m/^===========/o ) { 1625 $self->error("missing ======= after $header at line $lineCount"); 1626 closeMaybeGunzip($file, $fh); 1627 return(-1); 1628 } 1629 if ( !($_=<$fh>) || !m/^\s*$/o ) { 1630 $self->error("missing empty line after ======= at line $lineCount"); 1631 closeMaybeGunzip($file, $fh); 1632 return(-1); 1633 } 1634 last; 1635 } 1636 elsif ( $lineCount > 1000 ) { 1637 $self->error("$file: stopping at line $lineCount, didn't see \"$header\" line"); 1638 closeMaybeGunzip($file, $fh); 1639 return(-1); 1640 } 1641 } 1642 1643 my $progress=Term::ProgressBar->new({name => "parsing $whichMoviesOrGenres", 1644 count => $countEstimate, 1645 ETA => 'linear'}) 1646 if ( $self->{showProgressBar} ); 1647 1648 $progress->minor(0) if ($self->{showProgressBar}); 1649 $progress->max_update_rate(1) if ($self->{showProgressBar}); 1650 my $next_update=0; 1651 1652 my $count=0; 1653 while(<$fh>) { 1654 $lineCount++; 1655 my $line=$_; 1656 #print "read line $lineCount:$line\n"; 1657 1658 # end is line consisting of only '-' 1659 last if ( $line=~m/^\-\-\-\-\-\-\-+/o ); 1660 1661 $line=~s/\n$//o; 1662 1663 my $tab=index($line, "\t"); 1664 if ( $tab != -1 ) { 1665 my $mkey=substr($line, 0, $tab); 1666 1667 next if ($mkey=~m/\s*\{\{SUSPENDED\}\}/o); 1668 1669 if ( $whatAreWeParsing == 2 ) { 1670 # don't see what these are...? 1671 # ignore {{SUSPENDED}} 1672 $mkey=~s/\s*\{\{SUSPENDED\}\}//o; 1673 1674 # ignore {Twelve Angry Men (1954)} 1675 $mkey=~s/\s*\{[^\}]+\}//go; 1676 1677 # skip enties that have {} in them since they're tv episodes 1678 #next if ( $mkey=~s/\s*\{[^\}]+\}$//o ); 1679 1680 my $genre=substr($line, $tab); 1681 1682 # genres sometimes has more than one tab 1683 $genre=~s/^\t+//og; 1684 if ( defined($self->{movies}{$mkey}) ) { 1685 $self->{movies}{$mkey}.="|".$genre; 1686 } 1687 else { 1688 $self->{movies}{$mkey}=$genre; 1689 # returned count is number of unique titles found 1690 $count++; 1691 } 1692 } 1693 else { 1694 push(@{$self->{movies}}, $mkey); 1695 # returned count is number of titles found 1696 $count++; 1697 } 1698 1699 if ( $self->{showProgressBar} ) { 1700 # re-adjust target so progress bar doesn't seem too wonky 1701 if ( $count > $countEstimate ) { 1702 $countEstimate = $progress->target($count+1000); 1703 $next_update=$progress->update($count); 1704 } 1705 elsif ( $count > $next_update ) { 1706 $next_update=$progress->update($count); 1707 } 1708 } 1709 } 1710 else { 1711 $self->error("$file:$lineCount: unrecognized format (missing tab)"); 1712 $next_update=$progress->update($count) if ($self->{showProgressBar}); 1713 } 1714 } 1715 $progress->update($countEstimate) if ($self->{showProgressBar}); 1716 1717 $self->status(sprintf("parsing $whichMoviesOrGenres found ".withThousands($count)." titles in ". 1718 withThousands($lineCount)." lines in %d seconds",time()-$startTime)); 1719 1720 closeMaybeGunzip($file, $fh); 1721 return($count); 1722} 1723 1724sub readCastOrDirectors($$$) 1725{ 1726 my ($self, $whichCastOrDirector, $castCountEstimate, $file)=@_; 1727 my $startTime=time(); 1728 1729 my $header; 1730 my $whatAreWeParsing; 1731 my $lineCount=0; 1732 1733 if ( $whichCastOrDirector eq "Actors" ) { 1734 $header="THE ACTORS LIST"; 1735 $whatAreWeParsing=1; 1736 } 1737 elsif ( $whichCastOrDirector eq "Actresses" ) { 1738 $header="THE ACTRESSES LIST"; 1739 $whatAreWeParsing=2; 1740 } 1741 elsif ( $whichCastOrDirector eq "Directors" ) { 1742 $header="THE DIRECTORS LIST"; 1743 $whatAreWeParsing=3; 1744 } 1745 else { 1746 die "why are we here ?"; 1747 } 1748 1749 my $fh = openMaybeGunzip($file) || return(-2); 1750 my $progress=Term::ProgressBar->new({name => "parsing $whichCastOrDirector", 1751 count => $castCountEstimate, 1752 ETA => 'linear'}) 1753 if ($self->{showProgressBar}); 1754 $progress->minor(0) if ($self->{showProgressBar}); 1755 $progress->max_update_rate(1) if ($self->{showProgressBar}); 1756 my $next_update=0; 1757 while(<$fh>) { 1758 $lineCount++; 1759 if ( m/^$header/ ) { 1760 if ( !($_=<$fh>) || !m/^===========/o ) { 1761 $self->error("missing ======= after $header at line $lineCount"); 1762 closeMaybeGunzip($file, $fh); 1763 return(-1); 1764 } 1765 if ( !($_=<$fh>) || !m/^\s*$/o ) { 1766 $self->error("missing empty line after ======= at line $lineCount"); 1767 closeMaybeGunzip($file, $fh); 1768 return(-1); 1769 } 1770 if ( !($_=<$fh>) || !m/^Name\s+Titles\s*$/o ) { 1771 $self->error("missing name/titles line after ======= at line $lineCount"); 1772 closeMaybeGunzip($file, $fh); 1773 return(-1); 1774 } 1775 if ( !($_=<$fh>) || !m/^[\s\-]+$/o ) { 1776 $self->error("missing name/titles suffix line after ======= at line $lineCount"); 1777 closeMaybeGunzip($file, $fh); 1778 return(-1); 1779 } 1780 last; 1781 } 1782 elsif ( $lineCount > 1000 ) { 1783 $self->error("$file: stopping at line $lineCount, didn't see \"$header\" line"); 1784 closeMaybeGunzip($file, $fh); 1785 return(-1); 1786 } 1787 } 1788 1789 my $cur_name; 1790 my $count=0; 1791 my $castNames=0; 1792 while(<$fh>) { 1793 $lineCount++; 1794 my $line=$_; 1795 $line=~s/\n$//o; 1796 #$self->status("read line $lineCount:$line"); 1797 1798 # end is line consisting of only '-' 1799 last if ( $line=~m/^\-\-\-\-\-\-\-+/o ); 1800 1801 next if ( length($line) == 0 ); 1802 1803 if ( $line=~s/^([^\t]+)\t+//o ) { 1804 $cur_name=$1; 1805 $castNames++; 1806 1807 if ( $self->{showProgressBar} ) { 1808 # re-adjust target so progress bar doesn't seem too wonky 1809 if ( $castNames > $castCountEstimate ) { 1810 $castCountEstimate = $progress->target($castNames+100); 1811 $next_update=$progress->update($castNames); 1812 } 1813 elsif ( $castNames > $next_update ) { 1814 $next_update=$progress->update($castNames); 1815 } 1816 } 1817 } 1818 1819 my $billing; 1820 my $HostNarrator=""; 1821 if ( $whatAreWeParsing < 3 ) { 1822 # actors or actresses 1823 $billing="9999"; 1824 if ( $line=~s/\s*<(\d+)>//o ) { 1825 $billing=sprintf("%04d", int($1)); 1826 } 1827 1828 if ( (my $start=index($line, " [")) != -1 ) { 1829 #my $end=rindex($line, "]"); 1830 my $ex=substr($line, $start+1); 1831 1832 if ( $ex=~s/Host//o ) { 1833 if ( length($HostNarrator) ) { 1834 $HostNarrator.=","; 1835 } 1836 $HostNarrator.="Host"; 1837 } 1838 if ( $ex=~s/Narrator//o ) { 1839 if ( length($HostNarrator) ) { 1840 $HostNarrator.=","; 1841 } 1842 $HostNarrator.="Narrator"; 1843 } 1844 $line=substr($line, 0, $start); 1845 # ignore character name 1846 } 1847 } 1848 # try ignoring these 1849 next if ($line=~m/\s*\{\{SUSPENDED\}\}/o); 1850 1851 # don't see what these are...? 1852 # ignore {{SUSPENDED}} 1853 $line=~s/\s*\{\{SUSPENDED\}\}//o; 1854 1855 # [honir] this is wrong - this puts cast from all the episodes as though they are in the entire series! 1856 # ##ignore {Twelve Angry Men (1954)} 1857 $line=~s/\s*\{[^\}]+\}//o; 1858 1859 if ( $whatAreWeParsing < 3 ) { 1860 if ( $line=~s/\s*\(aka ([^\)]+)\).*$//o ) { 1861 # $attr=$1; 1862 } 1863 } 1864 if ( $line=~s/ (\(.*)$//o ) { 1865 # $attrs=$1; 1866 } 1867 $line=~s/^\s+//og; 1868 $line=~s/\s+$//og; 1869 1870 if ( $whatAreWeParsing < 3 ) { 1871 if ( $line=~s/\s+Narrator$//o ) { 1872 # ignore 1873 } 1874 } 1875 1876 my $val=$self->{movies}{$line}; 1877 my $name=$cur_name; 1878 if ( length($HostNarrator) ) { 1879 $name.="[$HostNarrator]"; 1880 } 1881 if ( defined($billing) ) { 1882 if ( defined($val) ) { 1883 $self->{movies}{$line}=$val."|$billing:$name"; 1884 } 1885 else { 1886 $self->{movies}{$line}="$billing:$name"; 1887 } 1888 } 1889 else { 1890 if ( defined($val) ) { 1891 $self->{movies}{$line}=$val."|$name"; 1892 } 1893 else { 1894 $self->{movies}{$line}=$name; 1895 } 1896 } 1897 $count++; 1898 } 1899 $progress->update($castCountEstimate) if ($self->{showProgressBar}); 1900 1901 $self->status(sprintf("parsing $whichCastOrDirector found ".withThousands($castNames)." names, ". 1902 withThousands($count)." titles in ".withThousands($lineCount)." lines in %d seconds",time()-$startTime)); 1903 1904 closeMaybeGunzip($file, $fh); 1905 1906 return($castNames); 1907} 1908 1909sub readRatings($$$$) 1910{ 1911 my ($self, $countEstimate, $file)=@_; 1912 my $startTime=time(); 1913 my $lineCount=0; 1914 1915 my $fh = openMaybeGunzip($file) || return(-2); 1916 while(<$fh>) { 1917 $lineCount++; 1918 if ( m/^MOVIE RATINGS REPORT/o ) { 1919 if ( !($_=<$fh>) || !m/^\s*$/o) { 1920 $self->error("missing empty line after \"MOVIE RATINGS REPORT\" at line $lineCount"); 1921 closeMaybeGunzip($file, $fh); 1922 return(-1); 1923 } 1924 if ( !($_=<$fh>) || !m/^New Distribution Votes Rank Title/o ) { 1925 $self->error("missing \"New Distribution Votes Rank Title\" at line $lineCount"); 1926 closeMaybeGunzip($file, $fh); 1927 return(-1); 1928 } 1929 last; 1930 } 1931 elsif ( $lineCount > 1000 ) { 1932 $self->error("$file: stopping at line $lineCount, didn't see \"MOVIE RATINGS REPORT\" line"); 1933 closeMaybeGunzip($file, $fh); 1934 return(-1); 1935 } 1936 } 1937 1938 my $progress=Term::ProgressBar->new({name => "parsing Ratings", 1939 count => $countEstimate, 1940 ETA => 'linear'}) 1941 if ($self->{showProgressBar}); 1942 1943 $progress->minor(0) if ($self->{showProgressBar}); 1944 $progress->max_update_rate(1) if ($self->{showProgressBar}); 1945 my $next_update=0; 1946 1947 my $count=0; 1948 while(<$fh>) { 1949 $lineCount++; 1950 my $line=$_; 1951 #print "read line $lineCount:$line"; 1952 1953 $line=~s/\n$//o; 1954 1955 # skip empty lines (only really appear right before last line ending with ---- 1956 next if ( $line=~m/^\s*$/o ); 1957 # end is line consisting of only '-' 1958 last if ( $line=~m/^\-\-\-\-\-\-\-+/o ); 1959 1960 # e.g. New Distribution Votes Rank Title 1961 # 0000000133 225568 8.9 12 Angry Men (1957) 1962 if ( $line=~s/^\s+([\.|\*|\d]+)\s+(\d+)\s+(\d+)\.(\d+)\s+//o ) { 1963 $self->{movies}{$line}=[$1,$2,"$3.$4"]; 1964 $count++; 1965 if ( $self->{showProgressBar} ) { 1966 # re-adjust target so progress bar doesn't seem too wonky 1967 if ( $count > $countEstimate ) { 1968 $countEstimate = $progress->target($count+1000); 1969 $next_update=$progress->update($count); 1970 } 1971 elsif ( $count > $next_update ) { 1972 $next_update=$progress->update($count); 1973 } 1974 } 1975 } 1976 else { 1977 $self->error("$file:$lineCount: unrecognized format"); 1978 $next_update=$progress->update($count) if ($self->{showProgressBar}); 1979 } 1980 } 1981 $progress->update($countEstimate) if ($self->{showProgressBar}); 1982 1983 $self->status(sprintf("parsing Ratings found ".withThousands($count)." titles in ". 1984 withThousands($lineCount)." lines in %d seconds",time()-$startTime)); 1985 1986 closeMaybeGunzip($file, $fh); 1987 return($count); 1988} 1989 1990sub readKeywords($$$$) 1991{ 1992 my ($self, $countEstimate, $file)=@_; 1993 my $startTime=time(); 1994 my $lineCount=0; 1995 1996 my $fh = openMaybeGunzip($file) || return(-2); 1997 while(<$fh>) { 1998 $lineCount++; 1999 2000 if ( m/THE KEYWORDS LIST/ ) { 2001 if ( !($_=<$fh>) || !m/^===========/o ) { 2002 $self->error("missing ======= after \"THE KEYWORDS LIST\" at line $lineCount"); 2003 closeMaybeGunzip($file, $fh); 2004 return(-1); 2005 } 2006 if ( !($_=<$fh>) || !m/^\s*$/o ) { 2007 $self->error("missing empty line after ======= at line $lineCount"); 2008 closeMaybeGunzip($file, $fh); 2009 return(-1); 2010 } 2011 last; 2012 } 2013 elsif ( $lineCount > 100000 ) { 2014 $self->error("$file: stopping at line $lineCount, didn't see \"THE KEYWORDS LIST\" line"); 2015 closeMaybeGunzip($file, $fh); 2016 return(-1); 2017 } 2018 } 2019 2020 my $progress=Term::ProgressBar->new({name => "parsing keywords", 2021 count => $countEstimate, 2022 ETA => 'linear'}) 2023 if ($self->{showProgressBar}); 2024 2025 $progress->minor(0) if ($self->{showProgressBar}); 2026 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2027 my $next_update=0; 2028 2029 my $count=0; 2030 while(<$fh>) { 2031 $lineCount++; 2032 my $line=$_; 2033 chomp($line); 2034 next if ($line =~ m/^\s*$/); 2035 my ($title, $keyword) = ($line =~ m/^(.*)\s+(\S+)\s*$/); 2036 if ( defined($title) and defined($keyword) ) { 2037 2038 my ($episode) = $title =~ m/^.*\s+(\{.*\})$/; 2039 2040 # ignore anything which is an episode (e.g. "{Doctor Who (#10.22)}" ) 2041 if ( !defined $episode || $episode eq '' ) 2042 { 2043 if ( defined($self->{movies}{$title}) ) { 2044 $self->{movies}{$title}.=",".$keyword; 2045 } else { 2046 $self->{movies}{$title}=$keyword; 2047 # returned count is number of unique titles found 2048 $count++; 2049 } 2050 } 2051 2052 if ( $self->{showProgressBar} ) { 2053 # re-adjust target so progress bar doesn't seem too wonky 2054 if ( $count > $countEstimate ) { 2055 $countEstimate = $progress->target($count+1000); 2056 $next_update=$progress->update($count); 2057 } 2058 elsif ( $count > $next_update ) { 2059 $next_update=$progress->update($count); 2060 } 2061 } 2062 } else { 2063 $self->error("$file:$lineCount: unrecognized format \"$line\""); 2064 $next_update=$progress->update($count) if ($self->{showProgressBar}); 2065 } 2066 } 2067 $progress->update($countEstimate) if ($self->{showProgressBar}); 2068 2069 $self->status(sprintf("parsing Keywords found ".withThousands($count)." titles in ". 2070 withThousands($lineCount)." lines in %d seconds",time()-$startTime)); 2071 2072 closeMaybeGunzip($file, $fh); 2073 return($count); 2074} 2075 2076sub readPlots($$$$) 2077{ 2078 my ($self, $countEstimate, $file)=@_; 2079 my $startTime=time(); 2080 my $lineCount=0; 2081 2082 my $fh = openMaybeGunzip($file) || return(-2); 2083 while(<$fh>) { 2084 $lineCount++; 2085 2086 if ( m/PLOT SUMMARIES LIST/ ) { 2087 if ( !($_=<$fh>) || !m/^===========/o ) { 2088 $self->error("missing ======= after \"PLOT SUMMARIES LIST\" at line $lineCount"); 2089 closeMaybeGunzip($file, $fh); 2090 return(-1); 2091 } 2092 if ( !($_=<$fh>) || !m/^-----------/o ) { 2093 $self->error("missing ------- line after ======= at line $lineCount"); 2094 closeMaybeGunzip($file, $fh); 2095 return(-1); 2096 } 2097 last; 2098 } 2099 elsif ( $lineCount > 500 ) { 2100 $self->error("$file: stopping at line $lineCount, didn't see \"PLOT SUMMARIES LIST\" line"); 2101 closeMaybeGunzip($file, $fh); 2102 return(-1); 2103 } 2104 } 2105 2106 my $progress=Term::ProgressBar->new({name => "parsing plots", 2107 count => $countEstimate, 2108 ETA => 'linear'}) 2109 if ($self->{showProgressBar}); 2110 2111 $progress->minor(0) if ($self->{showProgressBar}); 2112 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2113 my $next_update=0; 2114 2115 my $count=0; 2116 while(<$fh>) { 2117 $lineCount++; 2118 my $line=$_; 2119 chomp($line); 2120 next if ($line =~ m/^\s*$/); 2121 my ($title, $episode) = ($line =~ m/^MV:\s(.*?)\s?(\{.*\})?$/); 2122 if ( defined($title) ) { 2123 2124 # ignore anything which is an episode (e.g. "{Doctor Who (#10.22)}" ) 2125 if ( !defined $episode || $episode eq '' ) 2126 { 2127 my $plot = ''; 2128 LOOP: 2129 while (1) { 2130 if ( $line = <$fh> ) { 2131 $lineCount++; 2132 chomp($line); 2133 next if ($line =~ m/^\s*$/); 2134 if ( $line =~ m/PL:\s(.*)$/ ) { # plot summary is a number of lines starting "PL:" 2135 $plot .= ($plot ne ''?' ':'') . $1; 2136 } 2137 last LOOP if ( $line =~ m/BY:\s(.*)$/ ); # the author line "BY:" signals the end of the plot summary 2138 } else { 2139 last LOOP; 2140 } 2141 } 2142 2143 if ( !defined($self->{movies}{$title}) ) { 2144 # ensure there's no tab chars in the plot or else the db stage will barf 2145 $plot =~ s/\t//og; 2146 $self->{movies}{$title}=$plot; 2147 # returned count is number of unique titles found 2148 $count++; 2149 } 2150 } 2151 2152 if ( $self->{showProgressBar} ) { 2153 # re-adjust target so progress bar doesn't seem too wonky 2154 if ( $count > $countEstimate ) { 2155 $countEstimate = $progress->target($count+1000); 2156 $next_update=$progress->update($count); 2157 } 2158 elsif ( $count > $next_update ) { 2159 $next_update=$progress->update($count); 2160 } 2161 } 2162 } else { 2163 # skip lines up to the next "MV:" 2164 if ($line !~ m/^(---|PL:|BY:)/ ) { 2165 $self->error("$file:$lineCount: unrecognized format \"$line\""); 2166 } 2167 $next_update=$progress->update($count) if ($self->{showProgressBar}); 2168 } 2169 } 2170 $progress->update($countEstimate) if ($self->{showProgressBar}); 2171 2172 $self->status(sprintf("parsing Plots found $count ".withThousands($count)." in ". 2173 withThousands($lineCount)." lines in %d seconds",time()-$startTime)); 2174 2175 closeMaybeGunzip($file, $fh); 2176 return($count); 2177} 2178 2179sub stageComplete($) 2180{ 2181 my ($self, $stage)=@_; 2182 2183 if ( -f "$self->{imdbDir}/stage$stage.data" ) { 2184 return(1); 2185 } 2186 return(0); 2187} 2188 2189sub dbinfoLoad($) 2190{ 2191 my $self=shift; 2192 2193 my $ret=XMLTV::IMDB::loadDBInfo($self->{moviedbInfo}); 2194 if ( ref $ret eq 'SCALAR' ) { 2195 return($ret); 2196 } 2197 $self->{dbinfo}=$ret; 2198 return(undef); 2199} 2200 2201sub dbinfoAdd($$$) 2202{ 2203 my ($self, $key, $value)=@_; 2204 $self->{dbinfo}->{$key}=$value; 2205} 2206 2207sub dbinfoGet($$$) 2208{ 2209 my ($self, $key, $defaultValue)=@_; 2210 if ( defined($self->{dbinfo}->{$key}) ) { 2211 return($self->{dbinfo}->{$key}); 2212 } 2213 return($defaultValue); 2214} 2215 2216sub dbinfoSave($) 2217{ 2218 my $self=shift; 2219 open(INFO, "> $self->{moviedbInfo}") || return(1); 2220 for (sort keys %{$self->{dbinfo}}) { 2221 print INFO "".$_.":".$self->{dbinfo}->{$_}."\n"; 2222 } 2223 close(INFO); 2224 return(0); 2225} 2226 2227sub dbinfoGetFileSize($$) 2228{ 2229 my ($self, $key)=@_; 2230 2231 if ( !defined($self->{imdbListFiles}->{$key}) ) { 2232 die ("invalid call"); 2233 } 2234 my $fileSize=int(-s "$self->{imdbListFiles}->{$key}"); 2235 2236 # if compressed, then attempt to run gzip -l 2237 if ( $self->{imdbListFiles}->{$key}=~m/.gz$/) { 2238 if ( open(my $fd, "gzip -l ".$self->{imdbListFiles}->{$key}."|") ) { 2239 # if parse fails, then defalt to wild ass guess of compression of 65% 2240 $fileSize=int(($fileSize*100)/(100-65)); 2241 2242 while(<$fd>) { 2243 if ( m/^\s*\d+\s+(\d+)/ ) { 2244 $fileSize=$1; 2245 } 2246 } 2247 close($fd); 2248 } 2249 else { 2250 # wild ass guess of compression of 65% 2251 $fileSize=int(($fileSize*100)/(100-65)); 2252 } 2253 } 2254 return($fileSize); 2255} 2256 2257sub dbinfoCalcEstimate($$$) 2258{ 2259 my ($self, $key, $estimateSizePerEntry)=@_; 2260 2261 my $fileSize=$self->dbinfoGetFileSize($key); 2262 2263 my $countEstimate=int($fileSize/$estimateSizePerEntry); 2264 2265 $self->dbinfoAdd($key."_list_file", $self->{imdbListFiles}->{$key}); 2266 $self->dbinfoAdd($key."_list_file_size", int(-s "$self->{imdbListFiles}->{$key}")); 2267 $self->dbinfoAdd($key."_list_file_size_uncompressed", $fileSize); 2268 $self->dbinfoAdd($key."_list_count_estimate", $countEstimate); 2269 return($countEstimate); 2270} 2271 2272sub dbinfoCalcBytesPerEntry($$$) 2273{ 2274 my ($self, $key, $calcActualForThisNumber)=@_; 2275 2276 my $fileSize=$self->dbinfoGetFileSize($key); 2277 2278 return(int($fileSize/$calcActualForThisNumber)); 2279} 2280 2281sub invokeStage($$) 2282{ 2283 my ($self, $stage)=@_; 2284 2285 my $startTime=time(); 2286 if ( $stage == 1 ) { 2287 $self->status("parsing Movies list for stage $stage.."); 2288 my $countEstimate=$self->dbinfoCalcEstimate("movies", 47); 2289 2290 my $num=$self->readMoviesOrGenres("Movies", $countEstimate, "$self->{imdbListFiles}->{movies}"); 2291 if ( $num < 0 ) { 2292 if ( $num == -2 ) { 2293 $self->error("you need to download $self->{imdbListFiles}->{movies} from ftp.imdb.com"); 2294 } 2295 return(1); 2296 } 2297 elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) { 2298 my $better=$self->dbinfoCalcBytesPerEntry("movies", $num); 2299 $self->status("ARG estimate of $countEstimate for movies needs updating, found $num ($better bytes/entry)"); 2300 } 2301 $self->dbinfoAdd("db_stat_movie_count", "$num"); 2302 2303 $self->status("writing stage1 data .."); 2304 { 2305 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2306 my $progress=Term::ProgressBar->new({name => "writing titles", 2307 count => $countEstimate, 2308 ETA => 'linear'}) 2309 if ($self->{showProgressBar}); 2310 $progress->minor(0) if ($self->{showProgressBar}); 2311 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2312 my $next_update=0; 2313 2314 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2315 my $count=0; 2316 for my $movie (@{$self->{movies}}) { 2317 print OUT "$movie\n"; 2318 2319 $count++; 2320 if ($self->{showProgressBar}) { 2321 # re-adjust target so progress bar doesn't seem too wonky 2322 if ( $count > $countEstimate ) { 2323 $countEstimate = $progress->target($count+100); 2324 $next_update=$progress->update($count); 2325 } 2326 elsif ( $count > $next_update ) { 2327 $next_update=$progress->update($count); 2328 } 2329 } 2330 } 2331 $progress->update($countEstimate) if ($self->{showProgressBar}); 2332 close(OUT); 2333 delete($self->{movies}); 2334 } 2335 } 2336 elsif ( $stage == 2 ) { 2337 $self->status("parsing Directors list for stage $stage.."); 2338 2339 my $countEstimate=$self->dbinfoCalcEstimate("directors", 258); 2340 2341 my $num=$self->readCastOrDirectors("Directors", $countEstimate, "$self->{imdbListFiles}->{directors}"); 2342 if ( $num < 0 ) { 2343 if ( $num == -2 ) { 2344 $self->error("you need to download $self->{imdbListFiles}->{directors} from ftp.imdb.com (see http://www.imdb.com/interfaces)"); 2345 } 2346 return(1); 2347 } 2348 elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) { 2349 my $better=$self->dbinfoCalcBytesPerEntry("directors", $num); 2350 $self->status("ARG estimate of $countEstimate for directors needs updating, found $num ($better bytes/entry)"); 2351 } 2352 $self->dbinfoAdd("db_stat_director_count", "$num"); 2353 2354 $self->status("writing stage2 data .."); 2355 { 2356 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2357 my $progress=Term::ProgressBar->new({name => "writing directors", 2358 count => $countEstimate, 2359 ETA => 'linear'}) 2360 if ($self->{showProgressBar}); 2361 $progress->minor(0) if ($self->{showProgressBar}); 2362 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2363 my $next_update=0; 2364 2365 my $count=0; 2366 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2367 for my $key (keys %{$self->{movies}}) { 2368 my %dir; 2369 for (split('\|', $self->{movies}{$key})) { 2370 $dir{$_}++; 2371 } 2372 my @list; 2373 for (keys %dir) { 2374 push(@list, sprintf("%03d:%s", $dir{$_}, $_)); 2375 } 2376 my $value=""; 2377 for my $c (reverse sort {$a cmp $b} @list) { 2378 my ($num, $name)=split(':', $c); 2379 $value.=$name."|"; 2380 } 2381 $value=~s/\|$//o; 2382 print OUT "$key\t$value\n"; 2383 2384 $count++; 2385 if ($self->{showProgressBar}) { 2386 # re-adjust target so progress bar doesn't seem too wonky 2387 if ( $count > $countEstimate ) { 2388 $countEstimate = $progress->target($count+100); 2389 $next_update=$progress->update($count); 2390 } 2391 elsif ( $count > $next_update ) { 2392 $next_update=$progress->update($count); 2393 } 2394 } 2395 } 2396 $progress->update($countEstimate) if ($self->{showProgressBar}); 2397 close(OUT); 2398 delete($self->{movies}); 2399 } 2400 #unlink("$self->{imdbDir}/stage1.data"); 2401 } 2402 elsif ( $stage == 3 ) { 2403 $self->status("parsing Actors list for stage $stage.."); 2404 2405 #print "re-reading movies into memory for reverse lookup..\n"; 2406 my $countEstimate=$self->dbinfoCalcEstimate("actors", 449); 2407 2408 my $num=$self->readCastOrDirectors("Actors", $countEstimate, "$self->{imdbListFiles}->{actors}"); 2409 if ( $num < 0 ) { 2410 if ( $num == -2 ) { 2411 $self->error("you need to download $self->{imdbListFiles}->{actors} from ftp.imdb.com (see http://www.imdb.com/interfaces)"); 2412 } 2413 return(1); 2414 } 2415 elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) { 2416 my $better=$self->dbinfoCalcBytesPerEntry("actors", $num); 2417 $self->status("ARG estimate of $countEstimate for actors needs updating, found $num ($better bytes/entry)"); 2418 } 2419 $self->dbinfoAdd("db_stat_actor_count", "$num"); 2420 2421 $self->status("writing stage3 data .."); 2422 { 2423 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2424 my $progress=Term::ProgressBar->new({name => "writing actors", 2425 count => $countEstimate, 2426 ETA => 'linear'}) 2427 if ($self->{showProgressBar}); 2428 $progress->minor(0) if ($self->{showProgressBar}); 2429 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2430 my $next_update=0; 2431 2432 my $count=0; 2433 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2434 for my $key (keys %{$self->{movies}}) { 2435 print OUT "$key\t$self->{movies}{$key}\n"; 2436 2437 $count++; 2438 if ($self->{showProgressBar}) { 2439 # re-adjust target so progress bar doesn't seem too wonky 2440 if ( $count > $countEstimate ) { 2441 $countEstimate = $progress->target($count+100); 2442 $next_update=$progress->update($count); 2443 } 2444 elsif ( $count > $next_update ) { 2445 $next_update=$progress->update($count); 2446 } 2447 } 2448 } 2449 $progress->update($countEstimate) if ($self->{showProgressBar}); 2450 close(OUT); 2451 delete($self->{movies}); 2452 } 2453 } 2454 elsif ( $stage == 4 ) { 2455 $self->status("parsing Actresses list for stage $stage.."); 2456 2457 my $countEstimate=$self->dbinfoCalcEstimate("actresses", 483); 2458 my $num=$self->readCastOrDirectors("Actresses", $countEstimate, "$self->{imdbListFiles}->{actresses}"); 2459 if ( $num < 0 ) { 2460 if ( $num == -2 ) { 2461 $self->error("you need to download $self->{imdbListFiles}->{actresses} from ftp.imdb.com (see http://www.imdb.com/interfaces)"); 2462 } 2463 return(1); 2464 } 2465 elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) { 2466 my $better=$self->dbinfoCalcBytesPerEntry("actresses", $num); 2467 $self->status("ARG estimate of $countEstimate for actresses needs updating, found $num ($better bytes/entry)"); 2468 } 2469 $self->dbinfoAdd("db_stat_actress_count", "$num"); 2470 2471 $self->status("writing stage4 data .."); 2472 { 2473 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2474 my $progress=Term::ProgressBar->new({name => "writing actresses", 2475 count => $countEstimate, 2476 ETA => 'linear'}) 2477 if ($self->{showProgressBar}); 2478 $progress->minor(0) if ($self->{showProgressBar}); 2479 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2480 my $next_update=0; 2481 2482 my $count=0; 2483 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2484 for my $key (keys %{$self->{movies}}) { 2485 print OUT "$key\t$self->{movies}{$key}\n"; 2486 $count++; 2487 if ($self->{showProgressBar}) { 2488 # re-adjust target so progress bar doesn't seem too wonky 2489 if ( $count > $countEstimate ) { 2490 $countEstimate = $progress->target($count+100); 2491 $next_update=$progress->update($count); 2492 } 2493 elsif ( $count > $next_update ) { 2494 $next_update=$progress->update($count); 2495 } 2496 } 2497 } 2498 $progress->update($countEstimate) if ($self->{showProgressBar}); 2499 close(OUT); 2500 delete($self->{movies}); 2501 } 2502 #unlink("$self->{imdbDir}/stage3.data"); 2503 } 2504 elsif ( $stage == 5 ) { 2505 $self->status("parsing Genres list for stage $stage.."); 2506 my $countEstimate=$self->dbinfoCalcEstimate("genres", 68); 2507 2508 my $num=$self->readMoviesOrGenres("Genres", $countEstimate, "$self->{imdbListFiles}->{genres}"); 2509 if ( $num < 0 ) { 2510 if ( $num == -2 ) { 2511 $self->error("you need to download $self->{imdbListFiles}->{genres} from ftp.imdb.com"); 2512 } 2513 return(1); 2514 } 2515 elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) { 2516 my $better=$self->dbinfoCalcBytesPerEntry("genres", $num); 2517 $self->status("ARG estimate of $countEstimate for genres needs updating, found $num ($better bytes/entry)"); 2518 } 2519 $self->dbinfoAdd("db_stat_genres_count", "$num"); 2520 2521 $self->status("writing stage5 data .."); 2522 { 2523 my $countEstimate=$self->dbinfoGet("db_stat_genres_count", 0); 2524 my $progress=Term::ProgressBar->new({name => "writing genres", 2525 count => $countEstimate, 2526 ETA => 'linear'}) 2527 if ($self->{showProgressBar}); 2528 $progress->minor(0) if ($self->{showProgressBar}); 2529 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2530 my $next_update=0; 2531 2532 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2533 my $count=0; 2534 for my $movie (keys %{$self->{movies}}) { 2535 print OUT "$movie\t$self->{movies}->{$movie}\n"; 2536 2537 $count++; 2538 if ($self->{showProgressBar}) { 2539 # re-adjust target so progress bar doesn't seem too wonky 2540 if ( $count > $countEstimate ) { 2541 $countEstimate = $progress->target($count+100); 2542 $next_update=$progress->update($count); 2543 } 2544 elsif ( $count > $next_update ) { 2545 $next_update=$progress->update($count); 2546 } 2547 } 2548 } 2549 $progress->update($countEstimate) if ($self->{showProgressBar}); 2550 close(OUT); 2551 delete($self->{movies}); 2552 } 2553 } 2554 elsif ( $stage == 6 ) { 2555 $self->status("parsing Ratings list for stage $stage.."); 2556 my $countEstimate=$self->dbinfoCalcEstimate("ratings", 68); 2557 2558 my $num=$self->readRatings($countEstimate, "$self->{imdbListFiles}->{ratings}"); 2559 if ( $num < 0 ) { 2560 if ( $num == -2 ) { 2561 $self->error("you need to download $self->{imdbListFiles}->{ratings} from ftp.imdb.com"); 2562 } 2563 return(1); 2564 } 2565 elsif ( abs($num - $countEstimate) > $countEstimate*.10 ) { 2566 my $better=$self->dbinfoCalcBytesPerEntry("ratings", $num); 2567 $self->status("ARG estimate of $countEstimate for ratings needs updating, found $num ($better bytes/entry)"); 2568 } 2569 $self->dbinfoAdd("db_stat_ratings_count", "$num"); 2570 2571 $self->status("writing stage6 data .."); 2572 { 2573 my $countEstimate=$self->dbinfoGet("db_stat_ratings_count", 0); 2574 my $progress=Term::ProgressBar->new({name => "writing ratings", 2575 count => $countEstimate, 2576 ETA => 'linear'}) 2577 if ($self->{showProgressBar}); 2578 $progress->minor(0) if ($self->{showProgressBar}); 2579 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2580 my $next_update=0; 2581 2582 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2583 my $count=0; 2584 for my $movie (keys %{$self->{movies}}) { 2585 my @value=@{$self->{movies}->{$movie}}; 2586 print OUT "$movie\t$value[0]\t$value[1]\t$value[2]\n"; 2587 2588 $count++; 2589 if ($self->{showProgressBar}) { 2590 # re-adjust target so progress bar doesn't seem too wonky 2591 if ( $count > $countEstimate ) { 2592 $countEstimate = $progress->target($count+100); 2593 $next_update=$progress->update($count); 2594 } 2595 elsif ( $count > $next_update ) { 2596 $next_update=$progress->update($count); 2597 } 2598 } 2599 } 2600 $progress->update($countEstimate) if ($self->{showProgressBar}); 2601 close(OUT); 2602 delete($self->{movies}); 2603 } 2604 } 2605 elsif ( $stage == 7 ) { 2606 $self->status("parsing Keywords list for stage $stage.."); 2607 2608 if ( !defined($self->{imdbListFiles}->{keywords}) ) { 2609 $self->status("no keywords file downloaded, see --with-keywords details in documentation"); 2610 return(0); 2611 } 2612 2613 my $countEstimate=5630000; 2614 my $num=$self->readKeywords($countEstimate, "$self->{imdbListFiles}->{keywords}"); 2615 if ( $num < 0 ) { 2616 if ( $num == -2 ) { 2617 $self->error("you need to download $self->{imdbListFiles}->{keywords} from ftp.imdb.com"); 2618 } 2619 return(1); 2620 } 2621 elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) { 2622 $self->status("ARG estimate of $countEstimate for keywords needs updating, found $num"); 2623 } 2624 $self->dbinfoAdd("keywords_list_file", "$self->{imdbListFiles}->{keywords}"); 2625 $self->dbinfoAdd("keywords_list_file_size", -s "$self->{imdbListFiles}->{keywords}"); 2626 $self->dbinfoAdd("db_stat_keywords_count", "$num"); 2627 2628 $self->status("writing stage$stage data .."); 2629 { 2630 my $countEstimate=$self->dbinfoGet("db_stat_keywords_count", 0); 2631 my $progress=Term::ProgressBar->new({name => "writing keywords", 2632 count => $countEstimate, 2633 ETA => 'linear'}) 2634 if ($self->{showProgressBar}); 2635 $progress->minor(0) if ($self->{showProgressBar}); 2636 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2637 my $next_update=0; 2638 2639 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2640 2641 my $count=0; 2642 for my $movie (keys %{$self->{movies}}) { 2643 print OUT "$movie\t$self->{movies}->{$movie}\n"; 2644 2645 $count++; 2646 if ($self->{showProgressBar}) { 2647 # re-adjust target so progress bar doesn't seem too wonky 2648 if ( $count > $countEstimate ) { 2649 $countEstimate = $progress->target($count+100); 2650 $next_update=$progress->update($count); 2651 } 2652 elsif ( $count > $next_update ) { 2653 $next_update=$progress->update($count); 2654 } 2655 } 2656 } 2657 $progress->update($countEstimate) if ($self->{showProgressBar}); 2658 close(OUT); 2659 delete($self->{movies}); 2660 } 2661 } 2662 elsif ( $stage == 8 ) { 2663 $self->status("parsing Plot list for stage $stage.."); 2664 2665 if ( !defined($self->{imdbListFiles}->{plot}) ) { 2666 $self->status("no plot file downloaded, see --with-plot details in documentation"); 2667 return(0); 2668 } 2669 2670 my $countEstimate=222222; 2671 my $num=$self->readPlots($countEstimate, "$self->{imdbListFiles}->{plot}"); 2672 if ( $num < 0 ) { 2673 if ( $num == -2 ) { 2674 $self->error("you need to download $self->{imdbListFiles}->{plot} from ftp.imdb.com"); 2675 } 2676 return(1); 2677 } 2678 elsif ( abs($num - $countEstimate) > $countEstimate*.05 ) { 2679 $self->status("ARG estimate of $countEstimate for plots needs updating, found $num"); 2680 } 2681 $self->dbinfoAdd("plots_list_file", "$self->{imdbListFiles}->{plot}"); 2682 $self->dbinfoAdd("plots_list_file_size", -s "$self->{imdbListFiles}->{plot}"); 2683 $self->dbinfoAdd("db_stat_plots_count", "$num"); 2684 2685 $self->status("writing stage$stage data .."); 2686 { 2687 my $countEstimate=$self->dbinfoGet("db_stat_plots_count", 0); 2688 my $progress=Term::ProgressBar->new({name => "writing plots", 2689 count => $countEstimate, 2690 ETA => 'linear'}) 2691 if ($self->{showProgressBar}); 2692 $progress->minor(0) if ($self->{showProgressBar}); 2693 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2694 my $next_update=0; 2695 2696 open(OUT, "> $self->{imdbDir}/stage$stage.data") || die "$self->{imdbDir}/stage$stage.data:$!"; 2697 2698 my $count=0; 2699 for my $movie (keys %{$self->{movies}}) { 2700 print OUT "$movie\t$self->{movies}->{$movie}\n"; 2701 2702 $count++; 2703 if ($self->{showProgressBar}) { 2704 # re-adjust target so progress bar doesn't seem too wonky 2705 if ( $count > $countEstimate ) { 2706 $countEstimate = $progress->target($count+100); 2707 $next_update=$progress->update($count); 2708 } 2709 elsif ( $count > $next_update ) { 2710 $next_update=$progress->update($count); 2711 } 2712 } 2713 } 2714 $progress->update($countEstimate) if ($self->{showProgressBar}); 2715 close(OUT); 2716 delete($self->{movies}); 2717 } 2718 } 2719 elsif ( $stage == $self->{stageLast} ) { 2720 my $tab=sprintf("\t"); 2721 2722 $self->status("indexing all previous stage's data for stage ".$self->{stageLast}.".."); 2723 2724 $self->status("parsing stage 1 data (movie list).."); 2725 my %movies; 2726 { 2727 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2728 my $progress=Term::ProgressBar->new({name => "reading titles", 2729 count => $countEstimate, 2730 ETA => 'linear'}) 2731 if ($self->{showProgressBar}); 2732 $progress->minor(0) if ($self->{showProgressBar}); 2733 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2734 my $next_update=0; 2735 2736 open(IN, "< $self->{imdbDir}/stage1.data") || die "$self->{imdbDir}/stage1.data:$!"; 2737 while(<IN>) { 2738 chop(); 2739 $movies{$_}=""; 2740 2741 if ($self->{showProgressBar}) { 2742 # re-adjust target so progress bar doesn't seem too wonky 2743 if ( $. > $countEstimate ) { 2744 $countEstimate = $progress->target($.+100); 2745 $next_update=$progress->update($.); 2746 } 2747 elsif ( $. > $next_update ) { 2748 $next_update=$progress->update($.); 2749 } 2750 } 2751 } 2752 close(IN); 2753 $progress->update($countEstimate) if ($self->{showProgressBar}); 2754 } 2755 2756 $self->status("merging in stage 2 data (directors).."); 2757 if ( 1 ) { 2758 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2759 my $progress=Term::ProgressBar->new({name => "merging directors", 2760 count => $countEstimate, 2761 ETA => 'linear'}) 2762 if ($self->{showProgressBar}); 2763 $progress->minor(0) if ($self->{showProgressBar}); 2764 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2765 my $next_update=0; 2766 2767 open(IN, "< $self->{imdbDir}/stage2.data") || die "$self->{imdbDir}/stage2.data:$!"; 2768 while(<IN>) { 2769 chop(); 2770 s/^([^\t]+)\t//o; 2771 if ( !defined($movies{$1}) ) { 2772 $self->error("directors list references unidentified title '$1'"); 2773 next; 2774 } 2775 $movies{$1}=$_; 2776 2777 if ($self->{showProgressBar}) { 2778 # re-adjust target so progress bar doesn't seem too wonky 2779 if ( $. > $countEstimate ) { 2780 $countEstimate = $progress->target($.+100); 2781 $next_update=$progress->update($.); 2782 } 2783 elsif ( $. > $next_update ) { 2784 $next_update=$progress->update($.); 2785 } 2786 } 2787 } 2788 $progress->update($countEstimate) if ($self->{showProgressBar}); 2789 close(IN); 2790 } 2791 2792 if ( 1 ) { 2793 # fill in default for movies we didn't have a director for 2794 for my $key (keys %movies) { 2795 if ( !length($movies{$key})) { 2796 $movies{$key}="<>"; 2797 } 2798 } 2799 } 2800 2801 $self->status("merging in stage 3 data (actors).."); 2802 if ( 1 ) { 2803 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2804 my $progress=Term::ProgressBar->new({name => "merging actors", 2805 count => $countEstimate, 2806 ETA => 'linear'}) 2807 if ($self->{showProgressBar}); 2808 $progress->minor(0) if ($self->{showProgressBar}); 2809 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2810 my $next_update=0; 2811 2812 open(IN, "< $self->{imdbDir}/stage3.data") || die "$self->{imdbDir}/stage3.data:$!"; 2813 while(<IN>) { 2814 chop(); 2815 s/^([^\t]+)\t//o; 2816 my $dbkey=$1; 2817 my $val=$movies{$dbkey}; 2818 if ( !defined($val) ) { 2819 $self->error("actors list references unidentified title '$dbkey'"); 2820 next; 2821 } 2822 if ( $val=~m/$tab/o ) { 2823 $movies{$dbkey}=$val."|".$_; 2824 } 2825 else { 2826 $movies{$dbkey}=$val.$tab.$_; 2827 } 2828 if ($self->{showProgressBar}) { 2829 # re-adjust target so progress bar doesn't seem too wonky 2830 if ( $. > $countEstimate ) { 2831 $countEstimate = $progress->target($.+100); 2832 $next_update=$progress->update($.); 2833 } 2834 elsif ( $. > $next_update ) { 2835 $next_update=$progress->update($.); 2836 } 2837 } 2838 } 2839 $progress->update($countEstimate) if ($self->{showProgressBar}); 2840 close(IN); 2841 } 2842 2843 $self->status("merging in stage 4 data (actresses).."); 2844 if ( 1 ) { 2845 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 2846 my $progress=Term::ProgressBar->new({name => "merging actresses", 2847 count => $countEstimate, 2848 ETA => 'linear'}) 2849 if ($self->{showProgressBar}); 2850 $progress->minor(0) if ($self->{showProgressBar}); 2851 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2852 my $next_update=0; 2853 2854 open(IN, "< $self->{imdbDir}/stage4.data") || die "$self->{imdbDir}/stage4.data:$!"; 2855 while(<IN>) { 2856 chop(); 2857 s/^([^\t]+)\t//o; 2858 my $dbkey=$1; 2859 my $val=$movies{$dbkey}; 2860 if ( !defined($val) ) { 2861 $self->error("actresses list references unidentified title '$dbkey'"); 2862 next; 2863 } 2864 if ( $val=~m/$tab/o ) { 2865 $movies{$dbkey}=$val."|".$_; 2866 } 2867 else { 2868 $movies{$dbkey}=$val.$tab.$_; 2869 } 2870 if ($self->{showProgressBar}) { 2871 # re-adjust target so progress bar doesn't seem too wonky 2872 if ( $. > $countEstimate ) { 2873 $countEstimate = $progress->target($.+100); 2874 $next_update=$progress->update($.); 2875 } 2876 elsif ( $. > $next_update ) { 2877 $next_update=$progress->update($.); 2878 } 2879 } 2880 } 2881 $progress->update($countEstimate) if ($self->{showProgressBar}); 2882 close(IN); 2883 } 2884 if ( 1 ) { 2885 # fill in placeholder if no actors were found 2886 for my $key (keys %movies) { 2887 if ( !($movies{$key}=~m/$tab/o) ) { 2888 $movies{$key}.=$tab."<>"; 2889 } 2890 } 2891 } 2892 2893 $self->status("merging in stage 5 data (genres).."); 2894 if ( 1 ) { 2895 my $countEstimate=$self->dbinfoGet("db_stat_genres_count", 1); # '1' prevents the spurious "(nothing to do)" msg 2896 my $progress=Term::ProgressBar->new({name => "merging genres", 2897 count => $countEstimate, 2898 ETA => 'linear'}) 2899 if ($self->{showProgressBar}); 2900 $progress->minor(0) if ($self->{showProgressBar}); 2901 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2902 my $next_update=0; 2903 2904 open(IN, "< $self->{imdbDir}/stage5.data") || die "$self->{imdbDir}/stage5.data:$!"; 2905 while(<IN>) { 2906 chop(); 2907 s/^([^\t]+)\t//o; 2908 my $dbkey=$1; 2909 my $genres=$_; 2910 my $val=$movies{$dbkey}; 2911 if ( !defined($val) ) { 2912 $self->error("genres list references unidentified title '$1'"); 2913 next; 2914 } 2915 $movies{$dbkey}.=$tab.$genres; 2916 2917 if ($self->{showProgressBar}) { 2918 # re-adjust target so progress bar doesn't seem too wonky 2919 if ( $. > $countEstimate ) { 2920 $countEstimate = $progress->target($.+100); 2921 $next_update=$progress->update($.); 2922 } 2923 elsif ( $. > $next_update ) { 2924 $next_update=$progress->update($.); 2925 } 2926 } 2927 } 2928 $progress->update($countEstimate) if ($self->{showProgressBar}); 2929 close(IN); 2930 } 2931 2932 if ( 1 ) { 2933 # fill in placeholder if no genres were found 2934 for my $key (keys %movies) { 2935 my $val=$movies{$key}; 2936 my $t=index($val, $tab); 2937 if ( $t == -1 ) { 2938 die "corrupt entry '$key' '$val'"; 2939 } 2940 if ( index($val, $tab, $t+1) == -1 ) { 2941 $movies{$key}.=$tab."<>"; 2942 } 2943 } 2944 } 2945 2946 $self->status("merging in stage 6 data (ratings).."); 2947 if ( 1 ) { 2948 my $countEstimate=$self->dbinfoGet("db_stat_ratings_count", 1); # '1' prevents the spurious "(nothing to do)" msg 2949 my $progress=Term::ProgressBar->new({name => "merging ratings", 2950 count => $countEstimate, 2951 ETA => 'linear'}) 2952 if ($self->{showProgressBar}); 2953 $progress->minor(0) if ($self->{showProgressBar}); 2954 $progress->max_update_rate(1) if ($self->{showProgressBar}); 2955 my $next_update=0; 2956 2957 open(IN, "< $self->{imdbDir}/stage6.data") || die "$self->{imdbDir}/stage6.data:$!"; 2958 while(<IN>) { 2959 chop(); 2960 s/^([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)$//o; 2961 my $dbkey=$1; 2962 my ($ratingDist, $ratingVotes, $ratingRank)=($2,$3,$4); 2963 2964 my $val=$movies{$dbkey}; 2965 if ( !defined($val) ) { 2966 $self->error("ratings list references unidentified title '$1'"); 2967 next; 2968 } 2969 $movies{$dbkey}.=$tab.$ratingDist.$tab.$ratingVotes.$tab.$ratingRank; 2970 2971 if ($self->{showProgressBar}) { 2972 # re-adjust target so progress bar doesn't seem too wonky 2973 if ( $. > $countEstimate ) { 2974 $countEstimate = $progress->target($.+100); 2975 $next_update=$progress->update($.); 2976 } 2977 elsif ( $. > $next_update ) { 2978 $next_update=$progress->update($.); 2979 } 2980 } 2981 } 2982 $progress->update($countEstimate) if ($self->{showProgressBar}); 2983 close(IN); 2984 } 2985 2986 if ( 1 ) { 2987 # fill in placeholder if no genres were found 2988 for my $key (keys %movies) { 2989 my $val=$movies{$key}; 2990 2991 my $t=index($val, $tab); 2992 if ( $t == -1 ) { 2993 die "corrupt entry '$key' '$val'"; 2994 } 2995 my $j=index($val, $tab, $t+1); 2996 if ( $j == -1 ) { 2997 die "corrupt entry '$key' '$val'"; 2998 } 2999 if ( index($val, $tab, $j+1) == -1 ) { 3000 $movies{$key}.=$tab."<>".$tab."<>".$tab."<>"; 3001 } 3002 } 3003 } 3004 3005 $self->status("merging in stage 7 data (keywords).."); 3006 #if ( 1 ) { # this stage is optional 3007 if ( -f "$self->{imdbDir}/stage7.data" ) { 3008 my $countEstimate=$self->dbinfoGet("db_stat_keywords_count", 1); # '1' prevents the spurious "(nothing to do)" msg 3009 my $progress=Term::ProgressBar->new({name => "merging keywords", 3010 count => $countEstimate, 3011 ETA => 'linear'}) 3012 if ($self->{showProgressBar}); 3013 $progress->minor(0) if ($self->{showProgressBar}); 3014 $progress->max_update_rate(1) if ($self->{showProgressBar}); 3015 my $next_update=0; 3016 3017 open(IN, "< $self->{imdbDir}/stage7.data") || die "$self->{imdbDir}/stage7.data:$!"; 3018 while(<IN>) { 3019 chop(); 3020 s/^([^\t]+)\t+//o; 3021 my $dbkey=$1; 3022 my $keywords=$_; 3023 if ( !defined($movies{$dbkey}) ) { 3024 $self->error("keywords list references unidentified title '$1'"); 3025 next; 3026 } 3027 $movies{$dbkey}.=$tab.$keywords; 3028 3029 if ($self->{showProgressBar}) { 3030 # re-adjust target so progress bar doesn't seem too wonky 3031 if ( $. > $countEstimate ) { 3032 $countEstimate = $progress->target($.+100); 3033 $next_update=$progress->update($.); 3034 } 3035 elsif ( $. > $next_update ) { 3036 $next_update=$progress->update($.); 3037 } 3038 } 3039 } 3040 $progress->update($countEstimate) if ($self->{showProgressBar}); 3041 close(IN); 3042 } 3043 3044 if ( 1 ) { 3045 # fill in default for movies we didn't have any keywords for 3046 for my $key (keys %movies) { 3047 my $val=$movies{$key}; 3048 #keyword is 6th entry 3049 my $t = 0; 3050 for my $i (0..4) { 3051 $t=index($val, $tab, $t); 3052 if ( $t == -1 ) { 3053 die "Corrupt entry '$key' '$val'"; 3054 } 3055 $t+=1; 3056 } 3057 if ( index($val, $tab, $t) == -1 ) { 3058 $movies{$key}.=$tab."<>"; 3059 } 3060 } 3061 } 3062 3063 $self->status("merging in stage 8 data (plots).."); 3064 #if ( 1 ) { # this stage is optional 3065 if ( -f "$self->{imdbDir}/stage8.data" ) { 3066 my $countEstimate=$self->dbinfoGet("db_stat_plots_count", 1); # '1' prevents the spurious "(nothing to do)" msg 3067 my $progress=Term::ProgressBar->new({name => "merging plots", 3068 count => $countEstimate, 3069 ETA => 'linear'}) 3070 if ($self->{showProgressBar}); 3071 $progress->minor(0) if ($self->{showProgressBar}); 3072 $progress->max_update_rate(1) if ($self->{showProgressBar}); 3073 my $next_update=0; 3074 3075 open(IN, "< $self->{imdbDir}/stage8.data") || die "$self->{imdbDir}/stage8.data:$!"; 3076 while(<IN>) { 3077 chop(); 3078 s/^([^\t]+)\t+//o; 3079 my $dbkey=$1; 3080 my $plot=$_; 3081 if ( !defined($movies{$dbkey}) ) { 3082 $self->error("plot list references unidentified title '$1'"); 3083 next; 3084 } 3085 $movies{$dbkey}.=$tab.$plot; 3086 3087 if ($self->{showProgressBar}) { 3088 # re-adjust target so progress bar doesn't seem too wonky 3089 if ( $. > $countEstimate ) { 3090 $countEstimate = $progress->target($.+100); 3091 $next_update=$progress->update($.); 3092 } 3093 elsif ( $. > $next_update ) { 3094 $next_update=$progress->update($.); 3095 } 3096 } 3097 } 3098 $progress->update($countEstimate) if ($self->{showProgressBar}); 3099 close(IN); 3100 } 3101 if ( 1 ) { 3102 # fill in default for movies we didn't have any plot for 3103 for my $key (keys %movies) { 3104 my $val=$movies{$key}; 3105 #plot is 7th entry 3106 my $t = 0; 3107 for my $i (0..5) { 3108 $t=index($val, $tab, $t); 3109 if ( $t == -1 ) { 3110 die "Corrupt entry '$key' '$val'"; 3111 } 3112 $t+=1; 3113 } 3114 if ( index($val, $tab, $t) == -1 ) { 3115 $movies{$key}.=$tab."<>"; 3116 } 3117 } 3118 } 3119 3120 #unlink("$self->{imdbDir}/stage1.data"); 3121 #unlink("$self->{imdbDir}/stage2.data"); 3122 #unlink("$self->{imdbDir}/stage3.data"); 3123 3124 # --------------------------------------------------------------------------------------- 3125 3126 3127 # 3128 # note: not all movies end up with a cast, but we include them anyway. 3129 # 3130 3131 my %nmovies; 3132 { 3133 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 3134 my $progress=Term::ProgressBar->new({name => "computing index", 3135 count => $countEstimate, 3136 ETA => 'linear'}) 3137 if ($self->{showProgressBar}); 3138 $progress->minor(0) if ($self->{showProgressBar}); 3139 $progress->max_update_rate(1) if ($self->{showProgressBar}); 3140 my $next_update=0; 3141 3142 my $count=0; 3143 for my $key (keys %movies) { 3144 my $dbkey=$key; 3145 3146 # drop episode information - ex: {Twelve Angry Men (1954)} 3147 $dbkey=~s/\s*\{[^\}]+\}//go; 3148 3149 # todo - this would make things easier 3150 # change double-quotes around title to be (made-for-tv) suffix instead 3151 if ( $dbkey=~m/^\"/o && #" 3152 $dbkey=~m/\"\s*\(/o ) { #" 3153 $dbkey.=" (tv_series)"; 3154 } 3155 # how rude, some entries have (TV) appearing more than once. 3156 $dbkey=~s/\(TV\)\s*\(TV\)$/(TV)/o; 3157 3158 my $qualifier; 3159 if ( $dbkey=~s/\s+\(TV\)$//o ) { 3160 $qualifier="tv_movie"; 3161 } 3162 elsif ( $dbkey=~s/\s+\(mini\) \(tv_series\)$// ) { 3163 $qualifier="tv_mini_series"; 3164 } 3165 elsif ( $dbkey=~s/\s+\(tv_series\)$// ) { 3166 $qualifier="tv_series"; 3167 } 3168 elsif ( $dbkey=~s/\s+\(mini\)$//o ) { 3169 $qualifier="tv_mini_series"; 3170 } 3171 elsif ( $dbkey=~s/\s+\(V\)$//o ) { 3172 $qualifier="video_movie"; 3173 } 3174 elsif ( $dbkey=~s/\s+\(VG\)$//o ) { 3175 #$qualifier="video_game"; 3176 delete($movies{$key}); 3177 next; 3178 } 3179 else { 3180 $qualifier="movie"; 3181 } 3182 #if ( $dbkey=~s/\s+\((tv_series|tv_mini_series|tv_movie|video_movie|video_game)\)$//o ) { 3183 # $qualifier=$1; 3184 #} 3185 my $year; 3186 my $title=$dbkey; 3187 3188 if ( $title=~m/^\"/o && $title=~m/\"\s*\(/o ) { #" 3189 $title=~s/^\"//o; #" 3190 $title=~s/\"(\s*\()/$1/o; #" 3191 } 3192 3193 if ( $title=~s/\s+\((\d\d\d\d)\)$//o || 3194 $title=~s/\s+\((\d\d\d\d)\/[IVX]+\)$//o ) { 3195 $year=$1; 3196 } 3197 elsif ( $title=~s/\s+\((\?\?\?\?)\)$//o || 3198 $title=~s/\s+\((\?\?\?\?)\/[IVX]+\)$//o ) { 3199 $year="0000"; 3200 } 3201 else { 3202 $self->error("movie list format failed to decode year from title '$title'"); 3203 $year="0000"; 3204 } 3205 $title=~s/(.*),\s*(The|A|Une|Las|Les|Los|L\'|Le|La|El|Das|De|Het|Een)$/$2 $1/og; 3206 3207 my $hashkey=lc("$title ($year)"); 3208 $hashkey=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/oeg; 3209 3210 if ( defined($movies{$hashkey}) ) { 3211 die "unable to place moviedb key for $key, report to xmltv-devel\@lists.sf.net"; 3212 } 3213 die "title \"$title\" contains a tab" if ( $title=~m/\t/o ); 3214 #print "key:$dbkey\n\ttitle=$title\n\tyear=$year\n\tqualifier=$qualifier\n"; 3215 #print "key $key: value=\"$movies{$key}\"\n"; 3216 3217 $nmovies{$hashkey}=$dbkey.$tab.$year.$tab.$qualifier.$tab.delete($movies{$key}); 3218 $count++; 3219 3220 if ($self->{showProgressBar}) { 3221 # re-adjust target so progress bar doesn't seem too wonky 3222 if ( $count > $countEstimate ) { 3223 $countEstimate = $progress->target($count+100); 3224 $next_update=$progress->update($count); 3225 } 3226 elsif ( $count > $next_update ) { 3227 $next_update=$progress->update($count); 3228 } 3229 } 3230 } 3231 $progress->update($countEstimate) if ($self->{showProgressBar}); 3232 3233 if ( scalar(keys %movies) != 0 ) { 3234 die "what happened, we have keys left ?"; 3235 } 3236 undef(%movies); 3237 } 3238 3239 { 3240 my $countEstimate=$self->dbinfoGet("db_stat_movie_count", 0); 3241 my $progress=Term::ProgressBar->new({name => "writing database", 3242 count => $countEstimate, 3243 ETA => 'linear'}) 3244 if ($self->{showProgressBar}); 3245 $progress->minor(0) if ($self->{showProgressBar}); 3246 $progress->max_update_rate(1) if ($self->{showProgressBar}); 3247 my $next_update=0; 3248 3249 open(IDX, "> $self->{moviedbIndex}") || die "$self->{moviedbIndex}:$!"; 3250 open(DAT, "> $self->{moviedbData}") || die "$self->{moviedbData}:$!"; 3251 my $count=0; 3252 for my $key (sort {$a cmp $b} keys %nmovies) { 3253 my $val=delete($nmovies{$key}); 3254 #print "movie $key: $val\n"; 3255 #$val=~s/^([^\t]+)\t([^\t]+)\t([^\t]+)\t//o || die "internal failure ($key:$val)"; 3256 my ($dbkey, $year, $qualifier,$directors,$actors,@rest)=split('\t', $val); 3257 #die ("no 1") if ( !defined($dbkey)); 3258 #die ("no 2") if ( !defined($year)); 3259 #die ("no 3") if ( !defined($qualifier)); 3260 #die ("no 4") if ( !defined($directors)); 3261 #die ("no 5") if ( !defined($actors)); 3262 #print "key:$key\n\ttitle=$dbkey\n\tyear=$year\n\tqualifier=$qualifier\n"; 3263 3264 #my ($directors, $actors)=split('\t', $val); 3265 3266 my $details=""; 3267 3268 if ( $directors eq "<>" ) { 3269 $details.="<>"; 3270 } 3271 else { 3272 # sort directors by last name, removing duplicates 3273 my $last=''; 3274 for my $name (sort {$a cmp $b} split('\|', $directors)) { 3275 if ( $name ne $last ) { 3276 $details.="$name|"; 3277 $last=$name; 3278 } 3279 } 3280 $details=~s/\|$//o; 3281 } 3282 3283 #print " $dbkey: $val\n"; 3284 if ( $actors eq "<>" ) { 3285 $details.=$tab."<>"; 3286 } 3287 else { 3288 $details.=$tab; 3289 3290 # sort actors by billing, removing repeated entries 3291 # be warned, two actors may have the same billing level 3292 my $last=''; 3293 for my $c (sort {$a cmp $b} split('\|', $actors)) { 3294 my ($billing, $name)=split(':', $c); 3295 # remove Host/Narrators from end 3296 # BUG - should remove (I)'s from actors/actresses names when details are generated 3297 $name=~s/\s\([IVX]+\)\[/\[/o; 3298 $name=~s/\s\([IVX]+\)$//o; 3299 3300 if ( $name ne $last ) { 3301 $details.="$name|"; 3302 $last=$name; 3303 } 3304 #print " $c: split gives'$billing' and '$name'\n"; 3305 } 3306 $details=~s/\|$//o; 3307 } 3308 $count++; 3309 my $lineno=sprintf("%07d", $count); 3310 print IDX $key."\t".$dbkey."\t".$year."\t".$qualifier."\t".$lineno."\n"; 3311 print DAT $lineno.":".$details."\t".join($tab, @rest)."\n"; 3312 3313 if ($self->{showProgressBar}) { 3314 # re-adjust target so progress bar doesn't seem too wonky 3315 if ( $count > $countEstimate ) { 3316 $countEstimate = $progress->target($count+100); 3317 $next_update=$progress->update($count); 3318 } 3319 elsif ( $count > $next_update ) { 3320 $next_update=$progress->update($count); 3321 } 3322 } 3323 } 3324 $progress->update($countEstimate) if ($self->{showProgressBar}); 3325 close(DAT); 3326 close(IDX); 3327 } 3328 3329 $self->dbinfoAdd("db_version", $XMLTV::IMDB::VERSION); 3330 3331 if ( $self->dbinfoSave() ) { 3332 $self->error("$self->{moviedbInfo}:$!"); 3333 return(1); 3334 } 3335 3336 $self->status("running quick sanity check on database indexes..."); 3337 my $imdb=new XMLTV::IMDB('imdbDir' => $self->{imdbDir}, 3338 'verbose' => $self->{verbose}); 3339 3340 if ( -e "$self->{moviedbOffline}" ) { 3341 unlink("$self->{moviedbOffline}"); 3342 } 3343 3344 if ( my $errline=$imdb->sanityCheckDatabase() ) { 3345 open(OFF, "> $self->{moviedbOffline}") || die "$self->{moviedbOffline}:$!"; 3346 print OFF $errline."\n"; 3347 print OFF "one of the prep stages' must have produced corrupt data\n"; 3348 print OFF "report the following details to xmltv-devel\@lists.sf.net\n"; 3349 3350 my $info=XMLTV::IMDB::loadDBInfo($self->{moviedbInfo}); 3351 if ( ref $info eq 'SCALAR' ) { 3352 print OFF "\tdbinfo file corrupt\n"; 3353 print OFF "\t$info"; 3354 } 3355 else { 3356 for my $key (sort keys %{$info}) { 3357 print OFF "\t$key:$info->{$key}\n"; 3358 } 3359 } 3360 print OFF "database taken offline\n"; 3361 close(OFF); 3362 open(OFF, "< $self->{moviedbOffline}") || die "$self->{moviedbOffline}:$!"; 3363 while(<OFF>) { 3364 chop(); 3365 $self->error($_); 3366 } 3367 close(OFF); 3368 return(1); 3369 } 3370 $self->status("sanity intact :)"); 3371 } 3372 else { 3373 $self->error("tv_imdb: invalid stage $stage: only 1-".$self->{stageLast}." are valid"); 3374 return(1); 3375 } 3376 3377 $self->dbinfoAdd("seconds_to_complete_prep_stage_$stage", (time()-$startTime)); 3378 if ( $self->dbinfoSave() ) { 3379 $self->error("$self->{moviedbInfo}:$!"); 3380 return(1); 3381 } 3382 return(0); 3383} 3384 3385sub crunchStage($$) 3386{ 3387 my ($self, $stage)=@_; 3388 3389 if ( $stage == $self->{stageLast} ) { 3390 # check all the pre-requisite stages have been run 3391 for (my $st=1 ; $st < $self->{stageLast}; $st++ ) { 3392 if ( !$self->stageComplete($st) ) { 3393 #$self->error("prep stages must be run in sequence.."); 3394 $self->error("prepStage $st either has never been run or failed"); 3395 if ( grep { $_ == $st } values %{$self->{optionalStages}} ) { 3396 $self->error("data for this stage will NOT be added"); 3397 } else { 3398 $self->error("rerun tv_imdb with --prepStage=$st"); 3399 return(1); 3400 } 3401 } 3402 } 3403 } 3404 3405 if ( -f "$self->{moviedbInfo}" && $stage != 1 ) { 3406 my $ret=$self->dbinfoLoad(); 3407 if ( $ret ) { 3408 $self->error($ret); 3409 return(1); 3410 } 3411 } 3412 3413 $self->redirect("$self->{imdbDir}/stage$stage.log") || return(1); 3414 my $ret=$self->invokeStage($stage); 3415 $self->redirect(undef); 3416 3417 if ( $ret == 0 ) { 3418 if ( $self->{errorCountInLog} == 0 ) { 3419 $self->status("prep stage $stage succeeded with no errors"); 3420 } 3421 else { 3422 $self->status("prep stage $stage succeeded with $self->{errorCountInLog} errors in $self->{imdbDir}/stage$stage.log"); 3423 if ( $stage == $self->{stageLast} && $self->{errorCountInLog} > 30 && $self->{errorCountInLog} < 80 ) { 3424 $self->status("this stage commonly produces around 60 (or so) warnings because of imdb"); 3425 $self->status("list file inconsistancies, they can usually be safely ignored"); 3426 } 3427 } 3428 } 3429 else { 3430 if ( $self->{errorCountInLog} == 0 ) { 3431 $self->status("prep stage $stage failed (with no logged errors)"); 3432 } 3433 else { 3434 $self->status("prep stage $stage failed with $self->{errorCountInLog} errors in $self->{imdbDir}/stage$stage.log"); 3435 } 3436 } 3437 return($ret); 3438} 3439 34401; 3441