1#!/usr/bin/perl -w 2# 3# webcollage, Copyright © 1999-2019 by Jamie Zawinski <jwz@jwz.org> 4# This program decorates the screen with random images from the web. 5# One satisfied customer described it as "a nonstop pop culture brainbath." 6# 7# Permission to use, copy, modify, distribute, and sell this software and its 8# documentation for any purpose is hereby granted without fee, provided that 9# the above copyright notice appear in all copies and that both that 10# copyright notice and this permission notice appear in supporting 11# documentation. No representations are made about the suitability of this 12# software for any purpose. It is provided "as is" without express or 13# implied warranty. 14 15 16# To run this as a display mode with xscreensaver, add this to `programs': 17# 18# webcollage --root 19# webcollage --root --filter 'vidwhacker --stdin --stdout' 20# 21# 22# You can see this in action at https://www.jwz.org/webcollage/ -- 23# it auto-reloads about once a minute. To make a page similar to 24# that on your own system, do this: 25# 26# webcollage --size '800x600' --imagemap $HOME/www/webcollage/index 27# 28# 29# Requires that either the provided "webcollage-helper" program or 30# ImageMagick's "convert" be available on $PATH. 31# 32# 33# If you have the "driftnet" program installed, webcollage can display a 34# collage of images sniffed off your local ethernet, instead of pulled out 35# of search engines: in that way, your screensaver can display the images 36# that your co-workers are downloading! 37# 38# Driftnet is available here: http://www.ex-parrot.com/~chris/driftnet/ 39# Use it like so: 40# 41# webcollage --root --driftnet 42# 43# Driftnet is the Unix implementation of the MacOS "EtherPEG" program. 44 45 46require 5; 47use strict; 48 49# We can't "use diagnostics" here, because that library malfunctions if 50# you signal and catch alarms: it says "Uncaught exception from user code" 51# and exits, even though I damned well AM catching it! 52#use diagnostics; 53 54 55require Time::Local; 56require POSIX; 57use Fcntl ':flock'; # import LOCK_* constants 58use POSIX qw(strftime); 59use LWP::UserAgent; 60 61 62my $progname = $0; $progname =~ s@.*/@@g; 63my ($version) = ('$Revision: 1.183 $' =~ m/\s(\d[.\d]+)\s/s); 64my $copyright = "WebCollage $version, Copyright (c) 1999-2017" . 65 " Jamie Zawinski <jwz\@jwz.org>\n" . 66 " https://www.jwz.org/webcollage/\n"; 67 68 69 70my @search_methods = ( 71 # Google is rate-limiting us now, so this works ok from 72 # a short-running screen saver, but not as a batch job. 73 # I haven't found a workaround. 74 # 75 5, "googlephotos", \&pick_from_google_image_photos, 76 3, "googleimgs", \&pick_from_google_images, 77 3, "googlenums", \&pick_from_google_image_numbers, 78 79 # So let's try Bing instead. No rate limiting yet! 80 # 81 13, "bingphotos", \&pick_from_bing_image_photos, 82 11, "bingimgs", \&pick_from_bing_images, 83 10, "bingnums", \&pick_from_bing_image_numbers, 84 85 20, "flickr_recent", \&pick_from_flickr_recent, 86 15, "flickr_random", \&pick_from_flickr_random, 87 6, "livejournal", \&pick_from_livejournal_images, 88 89 11, "imgur", \&pick_from_imgur, 90 91 # Tumblr doesn't have an "or" search, so this isn't great. 92 3, "tumblr", \&pick_from_tumblr, 93 94 # I ran out of usable access tokens, May 2017 95 # 0, "instagram", \&pick_from_instagram, 96 97 # No longer exists, as of Apr 2014 98 # 0, "yahoorand", \&pick_from_yahoo_random_link, 99 100 # Twitter destroyed their whole API in 2013. 101 # 0, "twitpic", \&pick_from_twitpic_images, 102 # 0, "twitter", \&pick_from_twitter_images, 103 104 # This is a cute way to search for a certain webcams. 105 # Not included in default methods, since these images 106 # aren't terribly interesting by themselves. 107 # See also "SurveillanceSaver". 108 # 109 0, "securitycam", \&pick_from_security_camera, 110 111 # Nonfunctional as of June 2011. 112 # 0, "altavista", \&pick_from_alta_vista_random_link, 113 114 # In Apr 2002, Google asked me to stop searching them. 115 # I asked them to add a "random link" url. They said 116 # "that would be easy, we'll think about it" and then 117 # never wrote back. Booo Google! Booooo! So, screw 118 # those turkeys, I've turned Google searching back on. 119 # I'm sure they can take it. (Jan 2005.) 120 121 # Jan 2005: Yahoo fucked up their search form so that 122 # it's no longer possible to do "or" searches on news 123 # images, so we rarely get any hits there any more. 124 # 125 # 0, "yahoonews", \&pick_from_yahoo_news_text, 126 127 # Dec 2004: the ircimages guy's server can't take the 128 # heat, so he started banning the webcollage user agent. 129 # I tried to convince him to add a lighter-weight page to 130 # support webcollage better, but he doesn't care. 131 # 132 # 0, "ircimages", \&pick_from_ircimages, 133 134 # Dec 2002: Alta Vista has a new "random link" URL now. 135 # They added it specifically to better support webcollage! 136 # That was super cool of them. This is how we used to do 137 # it, before: 138 # 139 # 0, "avimages", \&pick_from_alta_vista_images, 140 # 0, "avtext", \&pick_from_alta_vista_text, 141 142 # This broke in 2004. Eh, Lycos sucks anyway. 143 # 144 # 0, "lycos", \&pick_from_lycos_text, 145 146 # This broke in 2003, I think. I suspect Hotbot is 147 # actually the same search engine data as Lycos. 148 # 149 # 0, "hotbot", \&pick_from_hotbot_text, 150 ); 151 152# programs we can use to write to the root window (tried in ascending order.) 153# 154my @root_displayers = ( 155 "xscreensaver-getimage -root -file", 156 "chbg -once -xscreensaver -max_size 100", 157 "xv -root -quit -viewonly +noresetroot -quick24 -rmode 5" . 158 " -rfg black -rbg black", 159 "xli -quiet -onroot -center -border black", 160 "xloadimage -quiet -onroot -center -border black", 161 162# this lame program wasn't built with vroot.h: 163# "xsri -scale -keep-aspect -center-horizontal -center-vertical", 164); 165 166 167# Some sites need cookies to work properly. These are they. 168# 169my %cookies = ( 170 "www.altavista.com" => "AV_ALL=1", # request uncensored searches 171 "web.altavista.com" => "AV_ALL=1", 172 "ircimages.com" => 'disclaimer=1', 173); 174 175 176# If this is set, it's a helper program to use for pasting images together: 177# this is somewhat faster than using ImageMagick. 178# 179my $webcollage_helper = undef; 180my $convert_cmd = 'convert'; 181 182my $opacity = 0.85; # Opacity when pasting images together. 183 184 185# Some sites have managed to poison the search engines. These are they. 186# (We auto-detect sites that have poisoned the search engines via excessive 187# keywords or dictionary words, but these are ones that slip through 188# anyway.) 189# 190# This can contain full host names, or 2 or 3 component domains. 191# 192my %poisoners = ( 193 "die.net" => 1, # 'l33t h4ck3r d00dz. 194 "genforum.genealogy.com" => 1, # Cluttering avtext with human names. 195 "rootsweb.com" => 1, # Cluttering avtext with human names. 196 "akamai.net" => 1, # Lots of sites have their images on Akamai. 197 "akamaitech.net" => 1, # But those are pretty much all banners. 198 # Since Akamai is super-expensive, let's 199 # go out on a limb and assume that all of 200 # their customers are rich-and-boring. 201 "bartleby.com" => 1, # Dictionary, cluttering avtext. 202 "encyclopedia.com" => 1, # Dictionary, cluttering avtext. 203 "onlinedictionary.datasegment.com" => 1, # Dictionary, cluttering avtext. 204 "hotlinkpics.com" => 1, # Porn site that has poisoned avimages 205 # (I don't see how they did it, though!) 206 "alwayshotels.com" => 1, # Poisoned Lycos pretty heavily. 207 "nextag.com" => 1, # Poisoned Alta Vista real good. 208 "ghettodriveby.com" => 1, # Poisoned Google Images. 209 "crosswordsolver.org" => 1, # Poisoned Google Images. 210 "xona.com" => 1, # Poisoned Google Images. 211 "freepatentsonline.com" => 1, # Poisoned Google Images. 212 "herbdatanz.com" => 1, # Poisoned Google Images. 213); 214 215 216# When verbosity is turned on, we warn about sites that we seem to be hitting 217# a lot: usually this means some new poisoner has made it into the search 218# engines. But sometimes, the warning is just because that site has a lot 219# of stuff on it. So these are the sites that are immune to the "frequent 220# site" diagnostic message. 221# 222my %warningless_sites = ( 223 "home.earthlink.net" => 1, 224 "www.angelfire.com" => 1, 225 "members.aol.com" => 1, 226 "img.photobucket.com" => 1, 227 "pics.livejournal.com" => 1, 228 "tinypic.com" => 1, 229 "flickr.com" => 1, 230 "staticflickr.com" => 1, 231 "live.staticflickr.com" => 1, 232 "pbase.com" => 1, 233 "blogger.com" => 1, 234 "multiply.com" => 1, 235 "wikimedia.org" => 1, 236 "twitpic.com" => 1, 237 "amazonaws.com" => 1, 238 "blogspot.com" => 1, 239 "photoshelter.com" => 1, 240 "myspacecdn.com" => 1, 241 "feedburner.com" => 1, 242 "wikia.com" => 1, 243 "ljplus.ru" => 1, 244 "yandex.ru" => 1, 245 "imgur.com" => 1, 246 "tumblr.com" => 1, 247 "yfrog.com" => 1, 248 "cdninstagram.com" => 1, 249 "gstatic.com" => 1, 250 251 "yimg.com" => 1, # This is where dailynews.yahoo.com stores 252 "eimg.com" => 1, # its images, so pick_from_yahoo_news_text() 253 # hits this every time. 254 255 "images.quizfarm.com" => 1, # damn those LJ quizzes... 256 "images.quizilla.com" => 1, 257 "images.quizdiva.net" => 1, 258 259 "driftnet" => 1, # builtin... 260 "local-directory" => 1, # builtin... 261); 262 263 264# For decoding HTML-encoded character entities to URLs. 265# In This Modern World, probably we should use HTML::Entities instead. 266# 267my %entity_table = ( 268 "apos" => '\'', 269 "quot" => '"', "amp" => '&', "lt" => '<', 270 "gt" => '>', "nbsp" => ' ', "iexcl" => '', 271 "cent" => "\xA2", "pound" => "\xA3", "curren" => "\xA4", 272 "yen" => "\xA5", "brvbar" => "\xA6", "sect" => "\xA7", 273 "uml" => "\xA8", "copy" => "\xA9", "ordf" => "\xAA", 274 "laquo" => "\xAB", "not" => "\xAC", "shy" => "\xAD", 275 "reg" => "\xAE", "macr" => "\xAF", "deg" => "\xB0", 276 "plusmn" => "\xB1", "sup2" => "\xB2", "sup3" => "\xB3", 277 "acute" => "\xB4", "micro" => "\xB5", "para" => "\xB6", 278 "middot" => "\xB7", "cedil" => "\xB8", "sup1" => "\xB9", 279 "ordm" => "\xBA", "raquo" => "\xBB", "frac14" => "\xBC", 280 "frac12" => "\xBD", "frac34" => "\xBE", "iquest" => "\xBF", 281 "Agrave" => "\xC0", "Aacute" => "\xC1", "Acirc" => "\xC2", 282 "Atilde" => "\xC3", "Auml" => "\xC4", "Aring" => "\xC5", 283 "AElig" => "\xC6", "Ccedil" => "\xC7", "Egrave" => "\xC8", 284 "Eacute" => "\xC9", "Ecirc" => "\xCA", "Euml" => "\xCB", 285 "Igrave" => "\xCC", "Iacute" => "\xCD", "Icirc" => "\xCE", 286 "Iuml" => "\xCF", "ETH" => "\xD0", "Ntilde" => "\xD1", 287 "Ograve" => "\xD2", "Oacute" => "\xD3", "Ocirc" => "\xD4", 288 "Otilde" => "\xD5", "Ouml" => "\xD6", "times" => "\xD7", 289 "Oslash" => "\xD8", "Ugrave" => "\xD9", "Uacute" => "\xDA", 290 "Ucirc" => "\xDB", "Uuml" => "\xDC", "Yacute" => "\xDD", 291 "THORN" => "\xDE", "szlig" => "\xDF", "agrave" => "\xE0", 292 "aacute" => "\xE1", "acirc" => "\xE2", "atilde" => "\xE3", 293 "auml" => "\xE4", "aring" => "\xE5", "aelig" => "\xE6", 294 "ccedil" => "\xE7", "egrave" => "\xE8", "eacute" => "\xE9", 295 "ecirc" => "\xEA", "euml" => "\xEB", "igrave" => "\xEC", 296 "iacute" => "\xED", "icirc" => "\xEE", "iuml" => "\xEF", 297 "eth" => "\xF0", "ntilde" => "\xF1", "ograve" => "\xF2", 298 "oacute" => "\xF3", "ocirc" => "\xF4", "otilde" => "\xF5", 299 "ouml" => "\xF6", "divide" => "\xF7", "oslash" => "\xF8", 300 "ugrave" => "\xF9", "uacute" => "\xFA", "ucirc" => "\xFB", 301 "uuml" => "\xFC", "yacute" => "\xFD", "thorn" => "\xFE", 302 "yuml" => "\xFF", 303 304 # HTML 4 entities that do not have 1:1 Latin1 mappings. 305 "bull" => "*", "hellip"=> "...", "prime" => "'", "Prime" => "\"", 306 "frasl" => "/", "trade" => "[tm]", "larr" => "<-", "rarr" => "->", 307 "harr" => "<->", "lArr" => "<=", "rArr" => "=>", "hArr" => "<=>", 308 "empty" => "\xD8", "minus" => "-", "lowast"=> "*", "sim" => "~", 309 "cong" => "=~", "asymp" => "~", "ne" => "!=", "equiv" => "==", 310 "le" => "<=", "ge" => ">=", "lang" => "<", "rang" => ">", 311 "loz" => "<>", "OElig" => "OE", "oelig" => "oe", "Yuml" => "Y", 312 "circ" => "^", "tilde" => "~", "ensp" => " ", "emsp" => " ", 313 "thinsp"=> " ", "ndash" => "-", "mdash" => "--", "lsquo" => "`", 314 "rsquo" => "'", "sbquo" => "'", "ldquo" => "\"", "rdquo" => "\"", 315 "bdquo" => "\"", "lsaquo"=> "<", "rsaquo"=> ">", 316); 317 318 319############################################################################## 320# 321# Various global flags set by command line parameters, or computed 322# 323############################################################################## 324 325 326my $current_state = "???"; # for diagnostics 327my $load_method; 328my $last_search; 329my $image_succeeded = -1; 330my $suppress_audit = 0; 331 332my $verbose_imgmap = 0; # print out rectangles and URLs only (stdout) 333my $verbose_warnings = 0; # print out warnings when things go wrong 334my $verbose_load = 0; # diagnostics about loading of URLs 335my $verbose_filter = 0; # diagnostics about page selection/rejection 336my $verbose_net = 0; # diagnostics about network I/O 337my $verbose_decode = 0; # diagnostics about img conversion pipelines 338my $verbose_http = 0; # diagnostics about all HTTP activity 339my $verbose_exec = 0; # diagnostics about executing programs 340 341my $report_performance_interval = 60 * 15; # print some stats every 15 minutes 342 343my $http_proxy = undef; 344my $http_timeout = 20; 345my $cvt_timeout = 10; 346 347my $min_width = 50; 348my $min_height = 50; 349my $min_ratio = 1/5; 350 351my $min_gif_area = (120 * 120); 352 353 354my $no_output_p = 0; 355my $urls_only_p = 0; 356my $cocoa_p = 0; 357my $imagemap_base = undef; 358 359my @pids_to_kill = (); # forked pids we should kill when we exit, if any. 360 361my $driftnet_magic = 'driftnet'; 362my $driftnet_dir = undef; 363my $default_driftnet_cmd = "driftnet -a -m 100"; 364 365my $local_magic = 'local-directory'; 366my $local_dir = undef; 367 368my $wordlist; 369 370my %rejected_urls; 371my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch", 372 "arrhenius", "arteriole", "blanket", "brainchild", 373 "burdensome", "carnival", "cherub", "chord", "clever", 374 "dedicate", "dilogarithm", "dolan", "dryden", 375 "eggplant"); 376 377 378############################################################################## 379# 380# Retrieving URLs 381# 382############################################################################## 383 384# returns three values: the HTTP response line; the document headers; 385# and the document body. 386# 387sub get_document_1($$$) { 388 my ($url, $referer, $timeout) = @_; 389 390 if (!defined($timeout)) { $timeout = $http_timeout; } 391 if ($timeout > $http_timeout) { $timeout = $http_timeout; } 392 393 my $user_agent = "$progname/$version"; 394 395 if ($url =~ m@^https?://www\.altavista\.com/@s || 396 $url =~ m@^https?://random\.yahoo\.com/@s || 397 $url =~ m@^https?://[^./]+\.google\.com/@s || 398 $url =~ m@^https?://www\.livejournal\.com/@s) { 399 # block this, you turkeys. 400 $user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.7)' . 401 ' Gecko/20070914 Firefox/2.0.0.7'; 402 } 403 404 my $ua = LWP::UserAgent->new ( agent => $user_agent, 405 keep_alive => 0, 406 env_proxy => 0, 407 ); 408 $ua->proxy ('http', $http_proxy) if $http_proxy; 409 $ua->default_header ('Referer' => $referer) if $referer; 410 $ua->default_header ('Accept' => '*/*'); 411 $ua->timeout($timeout) if $timeout; 412 413 if (0) { 414 $ua->add_handler ("request_send", 415 sub($$$) { 416 my ($req, $ua, $h) = @_; 417 print "\n>>[[\n"; $req->dump; print "\n]]\n"; 418 return; 419 }); 420 $ua->add_handler ("response_data", 421 sub($$$$) { 422 my ($req, $ua, $h, $data) = @_; 423 #print "\n<<[[\n"; print $data; print "\n]]\n"; 424 return 1; 425 }); 426 $ua->add_handler ("request_done", 427 sub($$$) { 428 my ($req, $ua, $h) = @_; 429 print "\n<<[[\n"; $req->dump; print "\n]]\n"; 430 return; 431 }); 432 } 433 434 if ($verbose_http) { 435 LOG (1, " ==> GET $url"); 436 LOG (1, " ==> User-Agent: $user_agent"); 437 LOG (1, " ==> Referer: $referer") if $referer; 438 } 439 440 my $res = $ua->get ($url); 441 442 my $http = ($res ? $res->status_line : '') || ''; 443 my $head = ($res ? $res->headers() : '') || ''; 444 $head = $head->as_string() if $head; 445 my $body = ($res && $res->is_success ? $res->decoded_content : '') || ''; 446 447 LOG ($verbose_net, "get_document_1 $url " . ($referer ? $referer : "")); 448 449 $head =~ s/\r\n/\n/gs; 450 $head =~ s/\r/\n/gs; 451 if ($verbose_http) { 452 foreach (split (/\n/, $head)) { 453 LOG ($verbose_http, " <== $_"); 454 } 455 } 456 457 my @L = split(/\r\n|\r|\n/, $body); 458 my $lines = @L; 459 LOG ($verbose_http, 460 " <== [ body ]: $lines lines, " . length($body) . " bytes"); 461 462 if (!$http) { 463 LOG (($verbose_net || $verbose_load), "null response: $url"); 464 return (); 465 } 466 467 return ( $http, $head, $body ); 468} 469 470 471# returns two values: the document headers; and the document body. 472# if the given URL did a redirect, returns the redirected-to document. 473# 474sub get_document($$;$) { 475 my ($url, $referer, $timeout) = @_; 476 my $start = time; 477 478 if (defined($referer) && $referer eq $driftnet_magic) { 479 return get_driftnet_file ($url); 480 } 481 482 if (defined($referer) && $referer eq $local_magic) { 483 return get_local_file ($url); 484 } 485 486 my $orig_url = $url; 487 my $loop_count = 0; 488 my $max_loop_count = 4; 489 490 do { 491 if (defined($timeout) && $timeout <= 0) { 492 LOG (($verbose_net || $verbose_load), "timed out for $url"); 493 $suppress_audit = 1; 494 return (); 495 } 496 497 my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout); 498 499 if (defined ($timeout)) { 500 my $now = time; 501 my $elapsed = $now - $start; 502 $timeout -= $elapsed; 503 $start = $now; 504 } 505 506 return () unless $http; # error message already printed 507 508 $http =~ s/[\r\n]+$//s; 509 510 if ( $http =~ m@^HTTP/[0-9.]+ 30[123]@ ) { 511 $_ = $head; 512 513 my ( $location ) = m@^location:[ \t]*(.*)$@im; 514 if ( $location ) { 515 $location =~ s/[\r\n]$//; 516 517 LOG ($verbose_net, "redirect from $url to $location"); 518 $referer = $url; 519 $url = $location; 520 521 if ($url =~ m@^/@) { 522 $referer =~ m@^(https?://[^/]+)@i; 523 $url = $1 . $url; 524 } elsif (! ($url =~ m@^[a-z]+:@i)) { 525 $_ = $referer; 526 s@[^/]+$@@g if m@^https?://[^/]+/@i; 527 $_ .= "/" if m@^https?://[^/]+$@i; 528 $url = $_ . $url; 529 } 530 531 } else { 532 LOG ($verbose_net, "no Location with \"$http\""); 533 return ( $url, $body ); 534 } 535 536 if ($loop_count++ > $max_loop_count) { 537 LOG ($verbose_net, 538 "too many redirects ($max_loop_count) from $orig_url"); 539 $body = undef; 540 return (); 541 } 542 543 } elsif ( $http =~ m@^HTTP/[0-9.]+ ([4-9][0-9][0-9].*)$@ ) { 544 545 LOG (($verbose_net || $verbose_load), "failed: $1 ($url)"); 546 547 # http errors -- return nothing. 548 $body = undef; 549 return (); 550 551 } elsif (!$body) { 552 553 LOG (($verbose_net || $verbose_load), "document contains no data: $url"); 554 return (); 555 556 } else { 557 558 # ok! 559 return ( $url, $body ); 560 } 561 562 } while (1); 563} 564 565# If we already have a cookie defined for this site, and the site is trying 566# to overwrite that very same cookie, let it do so. This is because nytimes 567# expires its cookies - it lets you upgrade to a new cookie without logging 568# in again, but you have to present the old cookie to get the new cookie. 569# So, by doing this, the built-in cypherpunks cookie will never go "stale". 570# 571sub set_cookie($$) { 572 my ($host, $cookie) = @_; 573 my $oc = $cookies{$host}; 574 return unless $oc; 575 $_ = $oc; 576 my ($oc_name, $oc_value) = m@^([^= \t\r\n]+)=(.*)$@; 577 $_ = $cookie; 578 my ($nc_name, $nc_value) = m@^([^= \t\r\n]+)=(.*)$@; 579 580 if ($oc_name eq $nc_name && 581 $oc_value ne $nc_value) { 582 $cookies{$host} = $cookie; 583 LOG ($verbose_net, "overwrote ${host}'s $oc_name cookie"); 584 } 585} 586 587 588############################################################################ 589# 590# Extracting image URLs from HTML 591# 592############################################################################ 593 594# given a URL and the body text at that URL, selects and returns a random 595# image from it. returns () if no suitable images found. 596# 597sub pick_image_from_body($$) { 598 my ($url, $body) = @_; 599 600 my $base = $url; 601 $_ = $url; 602 603 # if there's at least one slash after the host, take off the last 604 # pathname component 605 if ( m@^https?://[^/]+/@io ) { 606 $base =~ s@[^/]+$@@go; 607 } 608 609 # if there are no slashes after the host at all, put one on the end. 610 if ( m@^https?://[^/]+$@io ) { 611 $base .= "/"; 612 } 613 614 $_ = $body; 615 616 # strip out newlines, compress whitespace 617 s/[\r\n\t ]+/ /go; 618 619 # nuke comments 620 s/<!--.*?-->//go; 621 622 623 # There are certain web sites that list huge numbers of dictionary 624 # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise! 625 # Porn sites tend not to be reputable!) 626 # 627 # I do not want webcollage to filter on content: I want it to select 628 # randomly from the set of images on the web. All the logic here for 629 # rejecting some images is really a set of heuristics for rejecting 630 # images that are not really images: for rejecting *text* that is in 631 # GIF/JPEG/PNG form. I don't want text, I want pictures, and I want 632 # the content of the pictures to be randomly selected from among all 633 # the available content. 634 # 635 # So, filtering out "dirty" pictures by looking for "dirty" keywords 636 # would be wrong: dirty pictures exist, like it or not, so webcollage 637 # should be able to select them. 638 # 639 # However, picking a random URL is a hard thing to do. The mechanism I'm 640 # using is to search for a selection of random words. This is not 641 # perfect, but works ok most of the time. The way it breaks down is when 642 # some URLs get precedence because their pages list *every word* as 643 # related -- those URLs come up more often than others. 644 # 645 # So, after we've retrieved a URL, if it has too many keywords, reject 646 # it. We reject it not on the basis of what those keywords are, but on 647 # the basis that by having so many, the page has gotten an unfair 648 # advantage against our randomizer. 649 # 650 my $trip_count = 0; 651 foreach my $trip (@tripwire_words) { 652 $trip_count++ if m/$trip/i; 653 } 654 655 if ($trip_count >= $#tripwire_words - 2) { 656 LOG (($verbose_filter || $verbose_load), 657 "there is probably a dictionary in \"$url\": rejecting."); 658 $rejected_urls{$url} = -1; 659 $body = undef; 660 $_ = undef; 661 return (); 662 } 663 664 665 my @urls; 666 my %unique_urls; 667 668 foreach (split(/ *</)) { 669 if ( m/^meta.*["']keywords["']/i ) { 670 671 # Likewise, reject any web pages that have a KEYWORDS meta tag 672 # that is too long. 673 # 674 my $L = length($_); 675 if ($L > 1000) { 676 LOG (($verbose_filter || $verbose_load), 677 "excessive keywords ($L bytes) in $url: rejecting."); 678 $rejected_urls{$url} = $L; 679 $body = undef; 680 $_ = undef; 681 return (); 682 } else { 683 LOG ($verbose_filter, " keywords ($L bytes) in $url (ok)"); 684 } 685 686 } elsif (m/^ (IMG|A) \b .* (SRC|HREF) \s* = \s* ["']? (.*?) [ "'<>] /six || 687 m/^ (LINK|META) \b .* (REL|PROPERTY) \s* = \s* 688 ["']? (image_src|og:image) ["']? /six) { 689 690 my $was_inline = (lc($1) eq 'img'); 691 my $was_meta = (lc($1) eq 'link' || lc($1) eq 'meta'); 692 my $link = $3; 693 694 # For <link rel="image_src" href="..."> 695 # and <meta property="og:image" content="..."> 696 # 697 if ($was_meta) { 698 next unless (m/ (HREF|CONTENT) \s* = \s* ["']? (.*?) [ "'<>] /six); 699 $link = $2; 700 } 701 702 my ( $width ) = m/width ?=[ \"]*(\d+)/oi; 703 my ( $height ) = m/height ?=[ \"]*(\d+)/oi; 704 $_ = $link; 705 706 if ( m@^/@o ) { 707 my $site; 708 ( $site = $base ) =~ s@^(https?://[^/]*).*@$1@gio; 709 $_ = "$site$link"; 710 } elsif ( ! m@^[^/:?]+:@ ) { 711 $_ = "$base$link"; 712 s@/\./@/@g; 713 1 while (s@/[^/]+/\.\./@/@g); 714 } 715 716 # skip non-http 717 if ( ! m@^https?://@io ) { 718 next; 719 } 720 721 # skip non-image 722 if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@io ) { 723 next; 724 } 725 726 # skip really short or really narrow images 727 if ( $width && $width < $min_width) { 728 if (!$height) { $height = "?"; } 729 LOG ($verbose_filter, " skip narrow image $_ (${width}x$height)"); 730 next; 731 } 732 733 if ( $height && $height < $min_height) { 734 if (!$width) { $width = "?"; } 735 LOG ($verbose_filter, " skip short image $_ (${width}x$height)"); 736 next; 737 } 738 739 # skip images with ratios that make them look like banners. 740 if ($min_ratio && $width && $height && 741 ($width * $min_ratio ) > $height) { 742 if (!$height) { $height = "?"; } 743 LOG ($verbose_filter, " skip bad ratio $_ (${width}x$height)"); 744 next; 745 } 746 747 # skip GIFs with a small number of pixels -- those usually suck. 748 if ($width && $height && 749 m/\.gif$/io && 750 ($width * $height) < $min_gif_area) { 751 LOG ($verbose_filter, " skip small GIF $_ (${width}x$height)"); 752 next; 753 } 754 755 # skip images with a URL that indicates a Yahoo thumbnail. 756 if (m@\.yimg\.com/.*/t/@) { 757 if (!$width) { $width = "?"; } 758 if (!$height) { $height = "?"; } 759 LOG ($verbose_filter, " skip yahoo thumb $_ (${width}x$height)"); 760 next; 761 } 762 763 my $url = $_; 764 765 if ($unique_urls{$url}) { 766 LOG ($verbose_filter, " skip duplicate image $_"); 767 next; 768 } 769 770 LOG ($verbose_filter, 771 " image $url" . 772 ($width && $height ? " (${width}x${height})" : "") . 773 ($was_meta ? " (meta)" : $was_inline ? " (inline)" : "")); 774 775 776 my $weight = 1; 777 778 if ($was_meta) { 779 $weight = 20; # meta tag images are far preferable to inline images. 780 } else { 781 if ($url !~ m@[.](gif|png)$@io ) { 782 $weight += 2; # JPEGs are preferable to GIFs and PNGs. 783 } 784 if (! $was_inline) { 785 $weight += 4; # pointers to images are preferable to inlined images. 786 } 787 } 788 789 $unique_urls{$url}++; 790 for (my $i = 0; $i < $weight; $i++) { 791 $urls[++$#urls] = $url; 792 } 793 } 794 } 795 796 my $fsp = ($body =~ m@<frameset@i); 797 798 $_ = undef; 799 $body = undef; 800 801 @urls = depoison (@urls); 802 803 if ( $#urls < 0 ) { 804 LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : "")); 805 return (); 806 } 807 808 # pick a random element of the table 809 my $i = int(rand($#urls+1)); 810 $url = $urls[$i]; 811 812 LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url"); 813 814 return $url; 815} 816 817 818# Given a URL and the RSS feed from that URL, pick a random image from 819# the feed. This is a lot simpler than extracting images out of a page: 820# we already know we have reasonable images, so we just pick one. 821# Returns: the real URL of the page (preferably not the RSS version), 822# and the image. 823 824sub pick_image_from_rss($$) { 825 my ($url, $body) = @_; 826 827 my ($base) = ($body =~ m@<link>([^<>]+)</link>@si); # root link 828 829 my @items = ($body =~ m@<item\b[^<>]*>(.*?)</item>@gsi); 830 return unless @items; 831 832 my $n = @items; 833 my $i = int(rand($n)); 834 my $item = $items[$i]; 835 836 $base = $1 if ($item =~ m@<link>([^<>]+)</link>@si); # item link 837 $base = $url unless $base; 838 839 ($url) = ($item =~ m/<enclosure\b[^<>]*\burl="(.*?)"/si); 840 return unless $url; 841 842 LOG ($verbose_load, "picked image $i/$n: $url"); 843 return ($base, $url); 844} 845 846 847############################################################################ 848# 849# Subroutines for getting pages and images out of search engines 850# 851############################################################################ 852 853 854sub pick_dictionary() { 855 my @dicts = ("/usr/dict/words", 856 "/usr/share/dict/words", 857 "/usr/share/lib/dict/words", 858 "/usr/share/dict/cracklib-small", 859 "/usr/share/dict/cracklib-words" 860 ); 861 foreach my $f (@dicts) { 862 if (-f $f) { 863 $wordlist = $f; 864 last; 865 } 866 } 867 error ("$dicts[0] does not exist") unless defined($wordlist); 868} 869 870# returns a random word from the dictionary 871# 872sub random_word() { 873 874 return undef unless open (my $in, '<', $wordlist); 875 876 my $size = (stat($in))[7]; 877 my $word = undef; 878 my $count = 0; 879 880 while (1) { 881 error ("looping ($count) while reading $wordlist") 882 if (++$count > 100); 883 884 my $pos = int (rand ($size)); 885 if (seek ($in, $pos, 0)) { 886 $word = <$in>; # toss partial line 887 $word = <$in>; # keep next line 888 } 889 890 next unless ($word); 891 next if ($word =~ m/^[-\']/); 892 893 $word = lc($word); 894 $word =~ s/^.*-//s; 895 $word =~ s/^[^a-z]+//s; 896 $word =~ s/[^a-z]+$//s; 897 $word =~ s/\'s$//s; 898 $word =~ s/ys$/y/s; 899 $word =~ s/ally$//s; 900 $word =~ s/ly$//s; 901 $word =~ s/ies$/y/s; 902 $word =~ s/ally$/al/s; 903 $word =~ s/izes$/ize/s; 904 $word =~ s/esses$/ess/s; 905 $word =~ s/(.{5})ing$/$1/s; 906 907 next if (length ($word) > 14); 908 last if ($word); 909 } 910 911 close ($in); 912 913 if ( $word =~ s/\s/\+/gs ) { # convert intra-word spaces to "+". 914 $word = "\%22$word\%22"; # And put quotes (%22) around it. 915 } 916 917 return $word; 918} 919 920 921sub random_words($) { 922 my ($sep) = @_; 923 return (random_word() . $sep . 924 random_word() . $sep . 925 random_word() . $sep . 926 random_word() . $sep . 927 random_word()); 928} 929 930 931sub url_quote($) { 932 my ($s) = @_; 933 $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge; 934 return $s; 935} 936 937sub url_unquote($) { 938 my ($s) = @_; 939 $s =~ s/[+]/ /g; 940 $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige; 941 return $s; 942} 943 944sub html_quote($) { 945 my ($s) = @_; 946 $s =~ s/&/&/gi; 947 $s =~ s/</</gi; 948 $s =~ s/>/>/gi; 949 $s =~ s/\"/"/gi; 950 return $s; 951} 952 953sub html_unquote($) { 954 my ($s) = @_; 955 $s =~ s/(&([a-z]+);)/{ $entity_table{$2} || $1; }/gexi; # e.g., ' 956 $s =~ s/(&\#(\d+);)/{ chr($2) }/gexi; # e.g., ' 957 return $s; 958} 959 960 961# Loads the given URL (a search on some search engine) and returns: 962# - the total number of hits the search engine claimed it had; 963# - a list of URLs from the page that the search engine returned; 964# Note that this list contains all kinds of internal search engine 965# junk URLs too -- caller must prune them. 966# 967sub pick_from_search_engine($$$) { 968 my ( $timeout, $search_url, $words ) = @_; 969 970 $_ = $words; 971 s/%20/ /g; 972 973 print STDERR "\n\n" if ($verbose_load); 974 975 LOG ($verbose_load, "words: $_"); 976 LOG ($verbose_load, "URL: $search_url"); 977 978 $last_search = $search_url; # for warnings 979 980 my $start = time; 981 my ( $base, $body ) = get_document ($search_url, undef, $timeout); 982 if (defined ($timeout)) { 983 $timeout -= (time - $start); 984 if ($timeout <= 0) { 985 $body = undef; 986 LOG (($verbose_net || $verbose_load), 987 "timed out (late) for $search_url"); 988 $suppress_audit = 1; 989 return (); 990 } 991 } 992 993 return () if (! $body); 994 995 996 my @subpages; 997 998 my $search_count = "?"; 999 if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) { 1000 $search_count = $3; 1001 } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*) @i) { 1002 $search_count = $1; 1003 } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) { 1004 $search_count = $1; 1005 } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) { 1006 $search_count = $1; 1007 } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages 1008 $search_count = $1; 1009 } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages 1010 $search_count = $1; 1011 } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista 1012 $search_count = $1; 1013 } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages 1014 $search_count = $1; 1015 } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) { 1016 $search_count = $1; # lycos 1017 } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) { 1018 $search_count = $1; # hotbot 1019 } elsif ($body =~ m@no photos were found containing@i) { # avimages 1020 $search_count = "0"; 1021 } elsif ($body =~ m@found no document matching@i) { # avtext 1022 $search_count = "0"; 1023 } 1024 1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/); 1025 1026# if ($search_count eq "?" || $search_count eq "0") { 1027# my $file = "/tmp/wc.html"; 1028# open (my $out, '>', $file) || error ("writing $file: $!"); 1029# print $out $body; 1030# close $out; 1031# print STDERR blurb() . "###### wrote $file\n"; 1032# } 1033 1034 1035 my $length = length($body); 1036 my $href_count = 0; 1037 1038 $_ = $body; 1039 1040 s/[\r\n\t ]+/ /g; 1041 1042 1043 s/(<A )/\n$1/gi; 1044 foreach (split(/\n/)) { 1045 $href_count++; 1046 my ($u) = m@<A\s.*?\bHREF\s*=\s*([\"\'][^\"\'<>]+)@i; 1047 next unless $u; 1048 my ($u2) = m@<IMG\s.*\bSRC\s*=\s*[\"\']([^\"\'<>]+)@i; 1049 1050 if (m/\bm="\{(.*?)\}"/s) { # Bing info is inside JSON crud 1051 my $json = html_unquote($1); 1052 my ($href) = ($json =~ m/\b(?:surl|purl)\"?:\s*"(.*?)"/s); 1053 my ($img) = ($json =~ m/\b(?:imgurl|murl)\"?:\s*"(.*?)"/s); 1054 $u = "$img\t$href" if ($img && $href); 1055 1056 } elsif ($u2 && $u2 =~ m@://[^/]*\.gstatic\.com/@s) { $u = $u2; 1057 $u =~ s/^\"|\"$//s; 1058 1059 } elsif ($u =~ m/^\"([^\"]*)\"/) { $u = $1 # quoted string 1060 } elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; # or token 1061 } 1062 1063 if ( $rejected_urls{$u} ) { 1064 LOG ($verbose_filter, " pre-rejecting candidate: $u"); 1065 next; 1066 } 1067 1068 LOG ($verbose_http, " HREF: $u"); 1069 1070 $subpages[++$#subpages] = $u; 1071 } 1072 1073 if ( $#subpages < 0 ) { 1074 LOG ($verbose_filter, 1075 "found nothing on $base ($length bytes, $href_count links)."); 1076 return (); 1077 } 1078 1079 LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url"); 1080 1081 return ($search_count, @subpages); 1082} 1083 1084 1085sub depoison(@) { 1086 my (@urls) = @_; 1087 my @urls2 = (); 1088 foreach (@urls) { 1089 my ($h) = m@^https?://([^/: \t\r\n]+)@i; 1090 1091 next unless defined($h); 1092 1093 if ($poisoners{$h}) { 1094 LOG (($verbose_filter), " rejecting poisoner: $_"); 1095 next; 1096 } 1097 if ($h =~ m@([^.]+\.[^.]+\.[^.]+)$@ && 1098 $poisoners{$1}) { 1099 LOG (($verbose_filter), " rejecting poisoner: $_"); 1100 next; 1101 } 1102 if ($h =~ m@([^.]+\.[^.]+)$@ && 1103 $poisoners{$1}) { 1104 LOG (($verbose_filter), " rejecting poisoner: $_"); 1105 next; 1106 } 1107 1108 push @urls2, $_; 1109 } 1110 return @urls2; 1111} 1112 1113 1114# given a list of URLs, picks one at random; loads it; and returns a 1115# random image from it. 1116# returns the url of the page loaded; the url of the image chosen. 1117# 1118sub pick_image_from_pages($$$$@) { 1119 my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_; 1120 1121 $total_hit_count = "?" unless defined($total_hit_count); 1122 1123 @pages = depoison (@pages); 1124 LOG ($verbose_load, 1125 "" . ($#pages+1) . " candidates of $unfiltered_link_count links" . 1126 " ($total_hit_count total)"); 1127 1128 return () if ($#pages < 0); 1129 1130 my $i = int(rand($#pages+1)); 1131 my $page = $pages[$i]; 1132 1133 LOG ($verbose_load, "picked page $page"); 1134 1135 $suppress_audit = 1; 1136 1137 my ( $base2, $body2 ) = get_document ($page, $base, $timeout); 1138 1139 if (!$base2 || !$body2) { 1140 $body2 = undef; 1141 return (); 1142 } 1143 1144 my $img = pick_image_from_body ($base2, $body2); 1145 $body2 = undef; 1146 1147 if ($img) { 1148 return ($base2, $img); 1149 } else { 1150 return (); 1151 } 1152} 1153 1154 1155############################################################################# 1156## 1157## Pick images from random pages returned by the Yahoo Random Link 1158## 1159############################################################################# 1160# 1161## yahoorand 1162#my $yahoo_random_link = "http://random.yahoo.com/fast/ryl"; 1163# 1164# 1165# Picks a random page; picks a random image on that page; 1166# returns two URLs: the page containing the image, and the image. 1167# Returns () if nothing found this time. 1168# 1169#sub pick_from_yahoo_random_link($) { 1170# my ($timeout) = @_; 1171# 1172# print STDERR "\n\n" if ($verbose_load); 1173# LOG ($verbose_load, "URL: $yahoo_random_link"); 1174# 1175# $last_search = $yahoo_random_link; # for warnings 1176# 1177# $suppress_audit = 1; 1178# 1179# my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout); 1180# if (!$base || !$body) { 1181# $body = undef; 1182# return; 1183# } 1184# 1185# LOG ($verbose_load, "redirected to: $base"); 1186# 1187# my $img = pick_image_from_body ($base, $body); 1188# $body = undef; 1189# 1190# if ($img) { 1191# return ($base, $img); 1192# } else { 1193# return (); 1194# } 1195#} 1196 1197 1198############################################################################ 1199# 1200# Pick images from random pages returned by the Alta Vista Random Link 1201# Note: this seems to have gotten a *lot* less random lately (2007). 1202# 1203############################################################################ 1204 1205# altavista 1206my $alta_vista_random_link = "http://www.altavista.com/image/randomlink"; 1207 1208 1209# Picks a random page; picks a random image on that page; 1210# returns two URLs: the page containing the image, and the image. 1211# Returns () if nothing found this time. 1212# 1213sub pick_from_alta_vista_random_link($) { 1214 my ($timeout) = @_; 1215 1216 print STDERR "\n\n" if ($verbose_load); 1217 LOG ($verbose_load, "URL: $alta_vista_random_link"); 1218 1219 $last_search = $alta_vista_random_link; # for warnings 1220 1221 $suppress_audit = 1; 1222 1223 my ( $base, $body ) = get_document ($alta_vista_random_link, 1224 undef, $timeout); 1225 if (!$base || !$body) { 1226 $body = undef; 1227 return; 1228 } 1229 1230 LOG ($verbose_load, "redirected to: $base"); 1231 1232 my $img = pick_image_from_body ($base, $body); 1233 $body = undef; 1234 1235 if ($img) { 1236 return ($base, $img); 1237 } else { 1238 return (); 1239 } 1240} 1241 1242 1243############################################################################ 1244# 1245# Pick images by feeding random words into Alta Vista Image Search 1246# 1247############################################################################ 1248 1249 1250my $alta_vista_images_url = "http://www.altavista.com/image/results" . 1251 "?ipht=1" . # photos 1252 "&igrph=1" . # graphics 1253 "&iclr=1" . # color 1254 "&ibw=1" . # b&w 1255 "&micat=1" . # no partner sites 1256 "&sc=on" . # "site collapse" 1257 "&q="; 1258 1259# avimages 1260sub pick_from_alta_vista_images($) { 1261 my ($timeout) = @_; 1262 1263 my $words = random_word(); 1264 my $page = (int(rand(9)) + 1); 1265 my $search_url = $alta_vista_images_url . $words; 1266 1267 if ($page > 1) { 1268 $search_url .= "&pgno=" . $page; # page number 1269 $search_url .= "&stq=" . (($page-1) * 12); # first hit result on page 1270 } 1271 1272 my ($search_hit_count, @subpages) = 1273 pick_from_search_engine ($timeout, $search_url, $words); 1274 1275 my @candidates = (); 1276 foreach my $u (@subpages) { 1277 1278 # avimages is encoding their URLs now. 1279 next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi); 1280 $u = url_unquote($u); 1281 1282 next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs 1283 next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins 1284 next if ($u =~ m@[/.]yahoo\.com\b@i); # yahoo and av in cahoots? 1285 next if ($u =~ m@[/.]doubleclick\.net\b@i); # you cretins 1286 next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins 1287 1288 next if ($u =~ m@[/.]viewimages\.com\b@i); # stacked deck 1289 next if ($u =~ m@[/.]gettyimages\.com\b@i); 1290 1291 LOG ($verbose_filter, " candidate: $u"); 1292 push @candidates, $u; 1293 } 1294 1295 return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, 1296 $timeout, @candidates); 1297} 1298 1299 1300 1301############################################################################ 1302# 1303# Pick images from Aptix security cameras 1304# Cribbed liberally from google image search code. 1305# By Jason Sullivan <jasonsul@us.ibm.com> 1306# 1307############################################################################ 1308 1309my $aptix_images_url = ("http://www.google.com/search" . 1310 "?q=inurl:%22jpg/image.jpg%3Fr%3D%22"); 1311 1312# securitycam 1313sub pick_from_security_camera($) { 1314 my ($timeout) = @_; 1315 1316 my $page = (int(rand(9)) + 1); 1317 my $num = 20; # 20 images per page 1318 my $search_url = $aptix_images_url; 1319 1320 if ($page > 1) { 1321 $search_url .= "&start=" . $page*$num; # page number 1322 $search_url .= "&num=" . $num; #images per page 1323 } 1324 1325 my ($search_hit_count, @subpages) = 1326 pick_from_search_engine ($timeout, $search_url, ''); 1327 1328 my @candidates = (); 1329 my %referers; 1330 foreach my $u (@subpages) { 1331 next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins (most links) 1332 next unless ($u =~ m@jpg/image.jpg\?r=@i); # All pics contain this 1333 1334 LOG ($verbose_filter, " candidate: $u"); 1335 push @candidates, $u; 1336 $referers{$u} = $u; 1337 } 1338 1339 @candidates = depoison (@candidates); 1340 return () if ($#candidates < 0); 1341 my $i = int(rand($#candidates+1)); 1342 my $img = $candidates[$i]; 1343 my $ref = $referers{$img}; 1344 1345 LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)"); 1346 return ($ref, $img); 1347} 1348 1349 1350############################################################################ 1351# 1352# Pick images by feeding random words into Google Image Search. 1353# By Charles Gales <gales@us.ibm.com> 1354# 1355############################################################################ 1356 1357my $google_images_url = 'https://www.google.com/search' . 1358 '?source=lnms&tbm=isch&tbs=isz:l&q='; 1359 1360# googleimgs 1361sub pick_from_google_images($;$$) { 1362 my ($timeout, $words, $max_page) = @_; 1363 1364 if (!defined($words)) { 1365 $words = random_word(); # only one word for Google 1366 } 1367 1368 my $off = int(rand(40)); 1369 my $search_url = $google_images_url . $words . "&start=" . $off; 1370 1371 my ($search_hit_count, @subpages) = 1372 pick_from_search_engine ($timeout, $search_url, $words); 1373 1374 my @candidates = (); 1375 foreach my $u (@subpages) { 1376 $u = html_unquote($u); 1377 # next if ($u =~ m@^https?://[^.]*\.(google|youtube)\.com/@s); 1378 next unless ($u =~ m@^https?://[^/]*\.gstatic\.com@s); 1379 LOG ($verbose_filter, " candidate: $u"); 1380 push @candidates, $u; 1381 } 1382 1383 @candidates = depoison (@candidates); 1384 return () if ($#candidates < 0); 1385 my $i = int(rand($#candidates+1)); 1386 my $img = $candidates[$i]; 1387 1388 LOG ($verbose_load, "picked image " . ($i+1) . ": $img"); 1389 return ($img, $img); 1390} 1391 1392 1393 1394############################################################################ 1395# 1396# Pick images by feeding random numbers into Google Image Search. 1397# By jwz, suggested by Ian O'Donnell. 1398# 1399############################################################################ 1400 1401 1402# googlenums 1403sub pick_from_google_image_numbers($) { 1404 my ($timeout) = @_; 1405 1406 my $max = 9999; 1407 my $number = int(rand($max)); 1408 1409 $number = sprintf("%04d", $number) 1410 if (rand() < 0.3); 1411 1412 pick_from_google_images ($timeout, "$number"); 1413} 1414 1415 1416 1417############################################################################ 1418# 1419# Pick images by feeding random digital camera file names into 1420# Google Image Search. 1421# By jwz, inspired by the excellent Random Personal Picture Finder 1422# at http://www.diddly.com/random/ 1423# May 2017: Commented out a bunch of formats that have fallen out of favor. 1424# 1425############################################################################ 1426 1427my @photomakers = ( 1428 # 1429 # Common digital camera file name formats, as described at 1430 # http://www.diddly.com/random/about.html 1431 # 1432# sub { sprintf ("dcp%05d.jpg", int(rand(4000))); }, # Kodak 1433 sub { sprintf ("dsc%05d.jpg", int(rand(4000))); }, # Nikon 1434 sub { sprintf ("dscn%04d.jpg", int(rand(4000))); }, # Nikon 1435# sub { sprintf ("mvc-%03d.jpg", int(rand(999))); }, # Sony Mavica 1436# sub { sprintf ("mvc%05d.jpg", int(rand(9999))); }, # Sony Mavica 1437# sub { sprintf ("P101%04d.jpg", int(rand(9999))); }, # Olympus w/ date=101 1438# sub { sprintf ("P%x%02d%04d.jpg", # Olympus 1439# int(rand(0xC)), int(rand(30))+1, 1440# rand(9999)); }, 1441 sub { sprintf ("IMG_%03d.jpg", int(rand(999))); }, # ? 1442# sub { sprintf ("IMAG%04d.jpg", int(rand(9999))); }, # RCA and Samsung 1443# sub { my $n = int(rand(9999)); # Canon 1444# sprintf ("1%02d-%04d.jpg", int($n/100), $n); }, 1445# sub { my $n = int(rand(9999)); # Canon 1446# sprintf ("1%02d-%04d_IMG.jpg", 1447# int($n/100), $n); }, 1448 sub { sprintf ("IMG_%04d.jpg", int(rand(9999))); }, # Canon 1449 sub { sprintf ("dscf%04d.jpg", int(rand(9999))); }, # Fuji Finepix 1450# sub { sprintf ("pdrm%04d.jpg", int(rand(9999))); }, # Toshiba PDR 1451# sub { sprintf ("IM%06d.jpg", int(rand(9999))); }, # HP Photosmart 1452# sub { sprintf ("EX%06d.jpg", int(rand(9999))); }, # HP Photosmart 1453# sub { my $n = int(rand(3)); # Kodak DC-40,50,120 1454# sprintf ("DC%04d%s.jpg", int(rand(9999)), 1455# $n == 0 ? 'S' : $n == 1 ? 'M' : 'L'); }, 1456 sub { sprintf ("pict%04d.jpg", int(rand(9999))); }, # Minolta Dimage 1457# sub { sprintf ("P%07d.jpg", int(rand(9999))); }, # Kodak DC290 1458# sub { sprintf ("%02d%02d%04d.jpg", # Casio QV3000, QV4000 1459# int(rand(12))+1, int(rand(31))+1, 1460# int(rand(999))); }, 1461# sub { sprintf ("%02d%x%02d%04d.jpg", # Casio QV7000 1462# int(rand(6)), # year 1463# int(rand(12))+1, int(rand(31))+1, 1464# int(rand(999))); }, 1465 sub { sprintf ("IMGP%04d.jpg", int(rand(9999))); }, # Pentax Optio S 1466# sub { sprintf ("PANA%04d.jpg", int(rand(9999))); }, # Panasonic vid still 1467 sub { sprintf ("HPIM%04d.jpg", int(rand(9999))); }, # HP Photosmart 1468# sub { sprintf ("PCDV%04d.jpg", int(rand(9999))); }, # ? 1469 ); 1470 1471 1472# googlephotos 1473sub pick_from_google_image_photos($) { 1474 my ($timeout) = @_; 1475 1476 my $i = int(rand($#photomakers + 1)); 1477 my $fn = $photomakers[$i]; 1478 my $file = &$fn; 1479 #$file .= "%20filetype:jpg"; 1480 1481 pick_from_google_images ($timeout, $file); 1482} 1483 1484 1485############################################################################ 1486# 1487# Pick images by feeding random words into Google Image Search. 1488# By the way: fuck Microsoft. 1489# 1490############################################################################ 1491 1492my $bing_images_url = "http://www.bing.com/images/async?q="; 1493 1494 1495# bingimgs 1496sub pick_from_bing_images($;$$) { 1497 my ($timeout, $words, $max_page) = @_; 1498 1499 if (!defined($words)) { 1500 $words = random_word(); # only one word for Bing 1501 } 1502 1503 my $off = int(rand(300)); 1504 my $search_url = $bing_images_url . $words . "&first=" . $off; 1505 1506 my ($search_hit_count, @subpages) = 1507 pick_from_search_engine ($timeout, $search_url, $words); 1508 1509 my @candidates = (); 1510 my %referers; 1511 foreach my $u (@subpages) { 1512 my ($img, $ref) = ($u =~ m/^(.*?)\t(.*)$/s); 1513 next unless $img; 1514 LOG ($verbose_filter, " candidate: $ref"); 1515 push @candidates, $img; 1516 $referers{$img} = $ref; 1517 } 1518 1519 @candidates = depoison (@candidates); 1520 return () if ($#candidates < 0); 1521 my $i = int(rand($#candidates+1)); 1522 my $img = $candidates[$i]; 1523 my $ref = $referers{$img}; 1524 1525 LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)"); 1526 return ($ref, $img); 1527} 1528 1529 1530 1531 1532############################################################################ 1533# 1534# Pick images by feeding random numbers into Bing Image Search. 1535# 1536############################################################################ 1537 1538# bingnums 1539sub pick_from_bing_image_numbers($) { 1540 my ($timeout) = @_; 1541 1542 my $max = 9999; 1543 my $number = int(rand($max)); 1544 1545 $number = sprintf("%04d", $number) 1546 if (rand() < 0.3); 1547 1548 pick_from_bing_images ($timeout, "$number"); 1549} 1550 1551 1552############################################################################ 1553# 1554# Pick images by feeding random numbers into Bing Image Search. 1555# 1556############################################################################ 1557 1558# bingphotos 1559sub pick_from_bing_image_photos($) { 1560 my ($timeout) = @_; 1561 1562 my $i = int(rand($#photomakers + 1)); 1563 my $fn = $photomakers[$i]; 1564 my $file = &$fn; 1565 1566 pick_from_bing_images ($timeout, $file); 1567} 1568 1569 1570############################################################################ 1571# 1572# Pick images by feeding random words into Alta Vista Text Search 1573# 1574############################################################################ 1575 1576 1577my $alta_vista_url = "http://www.altavista.com/web/results" . 1578 "?pg=aq" . 1579 "&aqmode=s" . 1580 "&filetype=html" . 1581 "&sc=on" . # "site collapse" 1582 "&nbq=50" . 1583 "&aqo="; 1584 1585# avtext 1586sub pick_from_alta_vista_text($) { 1587 my ($timeout) = @_; 1588 1589 my $words = random_words('%20'); 1590 my $page = (int(rand(9)) + 1); 1591 my $search_url = $alta_vista_url . $words; 1592 1593 if ($page > 1) { 1594 $search_url .= "&pgno=" . $page; 1595 $search_url .= "&stq=" . (($page-1) * 10); 1596 } 1597 1598 my ($search_hit_count, @subpages) = 1599 pick_from_search_engine ($timeout, $search_url, $words); 1600 1601 my @candidates = (); 1602 foreach my $u (@subpages) { 1603 1604 # Those altavista fuckers are playing really nasty redirection games 1605 # these days: the filter your clicks through their site, but use 1606 # onMouseOver to make it look like they're not! Well, it makes it 1607 # easier for us to identify search results... 1608 # 1609 next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi); 1610 $u = url_unquote($u); 1611 1612 next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs 1613 next if ($u =~ m@[/.]altavista\.com\b@i); # skip altavista builtins 1614 next if ($u =~ m@[/.]yahoo\.com\b@i); # yahoo and av in cahoots? 1615 1616 LOG ($verbose_filter, " candidate: $u"); 1617 push @candidates, $u; 1618 } 1619 1620 return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, 1621 $timeout, @candidates); 1622} 1623 1624 1625 1626############################################################################ 1627# 1628# Pick images by feeding random words into Hotbot 1629# 1630############################################################################ 1631 1632my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" . 1633 "?ca=w" . 1634 "&descriptiontype=0" . 1635 "&imagetoggle=1" . 1636 "&matchmode=any" . 1637 "&nummod=2" . 1638 "&recordcount=50" . 1639 "&sitegroup=1" . 1640 "&stem=1" . 1641 "&cobrand=undefined" . 1642 "&query="); 1643 1644sub pick_from_hotbot_text($) { 1645 my ($timeout) = @_; 1646 1647 $last_search = $hotbot_search_url; # for warnings 1648 1649 # lycos seems to always give us back dictionaries and word lists if 1650 # we search for more than one word... 1651 # 1652 my $words = random_word(); 1653 1654 my $start = int(rand(8)) * 10 + 1; 1655 my $search_url = $hotbot_search_url . $words . "&first=$start&page=more"; 1656 1657 my ($search_hit_count, @subpages) = 1658 pick_from_search_engine ($timeout, $search_url, $words); 1659 1660 my @candidates = (); 1661 foreach my $u (@subpages) { 1662 1663 # Hotbot plays redirection games too 1664 # (not any more?) 1665# next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@); 1666# $u = url_decode($1); 1667 1668 next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs 1669 next if ($u =~ m@[/.]hotbot\.com\b@i); # skip hotbot builtins 1670 next if ($u =~ m@[/.]lycos\.com\b@i); # skip hotbot builtins 1671 next if ($u =~ m@[/.]inktomi\.com\b@i); # skip hotbot builtins 1672 1673 LOG ($verbose_filter, " candidate: $u"); 1674 push @candidates, $u; 1675 } 1676 1677 return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, 1678 $timeout, @candidates); 1679} 1680 1681 1682 1683############################################################################ 1684# 1685# Pick images by feeding random words into Lycos 1686# 1687############################################################################ 1688 1689my $lycos_search_url = "http://search.lycos.com/default.asp" . 1690 "?lpv=1" . 1691 "&loc=searchhp" . 1692 "&tab=web" . 1693 "&query="; 1694 1695sub pick_from_lycos_text($) { 1696 my ($timeout) = @_; 1697 1698 $last_search = $lycos_search_url; # for warnings 1699 1700 # lycos seems to always give us back dictionaries and word lists if 1701 # we search for more than one word... 1702 # 1703 my $words = random_word(); 1704 1705 my $start = int(rand(8)) * 10 + 1; 1706 my $search_url = $lycos_search_url . $words . "&first=$start&page=more"; 1707 1708 my ($search_hit_count, @subpages) = 1709 pick_from_search_engine ($timeout, $search_url, $words); 1710 1711 my @candidates = (); 1712 foreach my $u (@subpages) { 1713 1714 # Lycos plays redirection games. 1715 # (not any more?) 1716# next unless ($u =~ m@^https?://click.lycos.com/director.asp 1717# .* 1718# \btarget=([^&]+) 1719# .* 1720# @x); 1721# $u = url_decode($1); 1722 1723 next unless ($u =~ m@^https?://@i); # skip non-HTTP or relative URLs 1724 next if ($u =~ m@[/.]hotbot\.com\b@i); # skip lycos builtins 1725 next if ($u =~ m@[/.]lycos\.com\b@i); # skip lycos builtins 1726 next if ($u =~ m@[/.]terralycos\.com\b@i); # skip lycos builtins 1727 next if ($u =~ m@[/.]inktomi\.com\b@i); # skip lycos builtins 1728 1729 1730 LOG ($verbose_filter, " candidate: $u"); 1731 push @candidates, $u; 1732 } 1733 1734 return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, 1735 $timeout, @candidates); 1736} 1737 1738 1739 1740############################################################################ 1741# 1742# Pick images by feeding random words into news.yahoo.com 1743# 1744############################################################################ 1745 1746my $yahoo_news_url = "http://news.search.yahoo.com/search/news" . 1747 "?c=news_photos" . 1748 "&p="; 1749 1750# yahoonews 1751sub pick_from_yahoo_news_text($) { 1752 my ($timeout) = @_; 1753 1754 $last_search = $yahoo_news_url; # for warnings 1755 1756 my $words = random_word(); 1757 my $search_url = $yahoo_news_url . $words; 1758 1759 my ($search_hit_count, @subpages) = 1760 pick_from_search_engine ($timeout, $search_url, $words); 1761 1762 my @candidates = (); 1763 foreach my $u (@subpages) { 1764 1765 # de-redirectize the URLs 1766 $u =~ s@^https?://rds\.yahoo\.com/.*-http%3A@http:@s; 1767 1768 # only accept URLs on Yahoo's news site 1769 next unless ($u =~ m@^https?://dailynews\.yahoo\.com/@i || 1770 $u =~ m@^https?://story\.news\.yahoo\.com/@i); 1771 next unless ($u =~ m@&u=/@); 1772 1773 LOG ($verbose_filter, " candidate: $u"); 1774 push @candidates, $u; 1775 } 1776 1777 return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1, 1778 $timeout, @candidates); 1779} 1780 1781 1782 1783############################################################################ 1784# 1785# Pick images from LiveJournal's list of recently-posted images. 1786# 1787############################################################################ 1788 1789my $livejournal_img_url = "http://www.livejournal.com/stats/latest-img.bml"; 1790 1791# With most of our image sources, we get a random page and then select 1792# from the images on it. However, in the case of LiveJournal, the page 1793# of images tends to update slowly; so we'll remember the last N entries 1794# on it and randomly select from those, to get a wider variety each time. 1795 1796my $lj_cache_size = 1000; 1797my @lj_cache = (); # fifo, for ordering by age 1798my %lj_cache = (); # hash, for detecting dups 1799 1800# livejournal 1801sub pick_from_livejournal_images($) { 1802 my ($timeout) = @_; 1803 1804 $last_search = $livejournal_img_url; # for warnings 1805 1806 my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout); 1807 1808 # Often the document comes back empty. If so, just use the cache. 1809 # return () unless $body; 1810 $body = '' unless defined($body); 1811 1812 $body =~ s/\n/ /gs; 1813 $body =~ s/(<recent-image)\b/\n$1/gsi; 1814 1815 foreach (split (/\n/, $body)) { 1816 next unless (m/^<recent-image\b/); 1817 next unless (m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si); 1818 my $img = html_unquote ($1); 1819 1820 next if ($lj_cache{$img}); # already have it 1821 1822 next unless (m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si); 1823 my $page = html_unquote ($1); 1824 my @pair = ($img, $page); 1825 LOG ($verbose_filter, " candidate: $img"); 1826 push @lj_cache, \@pair; 1827 $lj_cache{$img} = \@pair; 1828 } 1829 1830 return () if ($#lj_cache == -1); 1831 1832 my $n = $#lj_cache+1; 1833 my $i = int(rand($n)); 1834 my ($img, $page) = @{$lj_cache[$i]}; 1835 1836 # delete this one from @lj_cache and from %lj_cache. 1837 # 1838 @lj_cache = ( @lj_cache[0 .. $i-1], 1839 @lj_cache[$i+1 .. $#lj_cache] ); 1840 delete $lj_cache{$img}; 1841 1842 # Keep the size of the cache under the limit by nuking older entries 1843 # 1844 while ($#lj_cache >= $lj_cache_size) { 1845 my $pairP = shift @lj_cache; 1846 my $img = $pairP->[0]; 1847 delete $lj_cache{$img}; 1848 } 1849 1850 LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img"); 1851 1852 return ($page, $img); 1853} 1854 1855 1856############################################################################ 1857# 1858# Pick images from ircimages.com (images that have been in the /topic of 1859# various IRC channels.) 1860# 1861############################################################################ 1862 1863my $ircimages_url = "http://ircimages.com/"; 1864 1865# ircimages 1866sub pick_from_ircimages($) { 1867 my ($timeout) = @_; 1868 1869 $last_search = $ircimages_url; # for warnings 1870 1871 my $n = int(rand(2900)); 1872 my $search_url = $ircimages_url . "page-$n"; 1873 1874 my ( $base, $body ) = get_document ($search_url, undef, $timeout); 1875 return () unless $body; 1876 1877 my @candidates = (); 1878 1879 $body =~ s/\n/ /gs; 1880 $body =~ s/(<A)\b/\n$1/gsi; 1881 1882 foreach (split (/\n/, $body)) { 1883 1884 my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i; 1885 next unless $u; 1886 1887 if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; } # quoted string 1888 elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; } # or token 1889 1890 next unless ($u =~ m/^https?:/i); 1891 next if ($u =~ m@^https?://(searchirc\.com\|ircimages\.com)@i); 1892 next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i); 1893 1894 LOG ($verbose_http, " HREF: $u"); 1895 push @candidates, $u; 1896 } 1897 1898 LOG ($verbose_filter, "" . $#candidates+1 . " links on $search_url"); 1899 1900 return () if ($#candidates == -1); 1901 1902 my $i = int(rand($#candidates+1)); 1903 my $img = $candidates[$i]; 1904 1905 LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#candidates+1) . 1906 ": $img"); 1907 1908 $search_url = $img; # hmm... 1909 return ($search_url, $img); 1910} 1911 1912 1913############################################################################ 1914# 1915# Pick images from Twitpic's list of recently-posted images. 1916# 1917############################################################################ 1918 1919my $twitpic_img_url = "http://twitpic.com/public_timeline/feed.rss"; 1920 1921# With most of our image sources, we get a random page and then select 1922# from the images on it. However, in the case of Twitpic, the page 1923# of images tends to update slowly; so we'll remember the last N entries 1924# on it and randomly select from those, to get a wider variety each time. 1925 1926my $twitpic_cache_size = 1000; 1927my @twitpic_cache = (); # fifo, for ordering by age 1928my %twitpic_cache = (); # hash, for detecting dups 1929 1930# twitpic 1931sub pick_from_twitpic_images($) { 1932 my ($timeout) = @_; 1933 1934 $last_search = $twitpic_img_url; # for warnings 1935 1936 my ( $base, $body ) = get_document ($twitpic_img_url, undef, $timeout); 1937 1938 # Update the cache. 1939 1940 if ($body) { 1941 $body =~ s/\n/ /gs; 1942 $body =~ s/(<item)\b/\n$1/gsi; 1943 1944 my @items = split (/\n/, $body); 1945 shift @items; 1946 foreach (@items) { 1947 next unless (m@<link>([^<>]*)</link>@si); 1948 my $page = html_unquote ($1); 1949 1950 $page =~ s@/$@@s; 1951 $page .= '/full'; 1952 1953 next if ($twitpic_cache{$page}); # already have it 1954 1955 LOG ($verbose_filter, " candidate: $page"); 1956 push @twitpic_cache, $page; 1957 $twitpic_cache{$page} = $page; 1958 } 1959 } 1960 1961 # Pull from the cache. 1962 1963 return () if ($#twitpic_cache == -1); 1964 1965 my $n = $#twitpic_cache+1; 1966 my $i = int(rand($n)); 1967 my $page = $twitpic_cache[$i]; 1968 1969 # delete this one from @twitpic_cache and from %twitpic_cache. 1970 # 1971 @twitpic_cache = ( @twitpic_cache[0 .. $i-1], 1972 @twitpic_cache[$i+1 .. $#twitpic_cache] ); 1973 delete $twitpic_cache{$page}; 1974 1975 # Keep the size of the cache under the limit by nuking older entries 1976 # 1977 while ($#twitpic_cache >= $twitpic_cache_size) { 1978 my $page = shift @twitpic_cache; 1979 delete $twitpic_cache{$page}; 1980 } 1981 1982 ( $base, $body ) = get_document ($page, undef, $timeout); 1983 my $img = undef; 1984 $body = '' unless defined($body); 1985 1986 foreach (split (/<img\s+/, $body)) { 1987 my ($src) = m/\bsrc=[\"\'](.*?)[\"\']/si; 1988 next unless $src; 1989 next if m@/js/@s; 1990 next if m@/images/@s; 1991 1992 $img = $src; 1993 1994 $img = "http:$img" if ($img =~ m@^//@s); # Oh come on 1995 1996 # Sometimes these images are hosted on twitpic, sometimes on Amazon. 1997 if ($img =~ m@^/@) { 1998 $base =~ s@^(https?://[^/]+)/.*@$1@s; 1999 $img = $base . $img; 2000 } 2001 last; 2002 } 2003 2004 if (!$img) { 2005 LOG ($verbose_load, "no matching images on $page\n"); 2006 return (); 2007 } 2008 2009 LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img"); 2010 2011 return ($page, $img); 2012} 2013 2014 2015############################################################################ 2016# 2017# Pick images from Twitter's list of recently-posted updates. 2018# 2019############################################################################ 2020 2021# With most of our image sources, we get a random page and then select 2022# from the images on it. However, in the case of Twitter, the page 2023# of images only updates once a minute; so we'll remember the last N entries 2024# on it and randomly select from those, to get a wider variety each time. 2025 2026my $twitter_img_url = "http://api.twitter.com/1/statuses/" . 2027 "public_timeline.json" . 2028 "?include_entities=true" . 2029 "&include_rts=true" . 2030 "&count=200"; 2031 2032my $twitter_cache_size = 1000; 2033 2034my @twitter_cache = (); # fifo, for ordering by age 2035my %twitter_cache = (); # hash, for detecting dups 2036 2037 2038# twitter 2039sub pick_from_twitter_images($) { 2040 my ($timeout) = @_; 2041 2042 $last_search = $twitter_img_url; # for warnings 2043 2044 my ( $base, $body ) = get_document ($twitter_img_url, undef, $timeout); 2045 # Update the cache. 2046 2047 if ($body) { 2048 $body =~ s/[\r\n]+/ /gs; 2049 2050 # Parsing JSON is a pain in the ass. So we halfass it as usual. 2051 $body =~ s/^\[|\]$//s; 2052 $body =~ s/(\[.*?\])/{ $_ = $1; s@\},@\} @gs; $_; }/gsexi; 2053 my @items = split (/\},\{/, $body); 2054 foreach (@items) { 2055 my ($name) = m@"screen_name":"([^\"]+)"@si; 2056 my ($img) = m@"media_url":"([^\"]+)"@si; 2057 my ($page) = m@"display_url":"([^\"]+)"@si; 2058 next unless ($name && $img && $page); 2059 foreach ($img, $page) { 2060 s/\\//gs; 2061 $_ = "http://$_" unless (m/^http/si); 2062 } 2063 2064 next if ($twitter_cache{$page}); # already have it 2065 2066 LOG ($verbose_filter, " candidate: $page - $img"); 2067 push @twitter_cache, $page; 2068 $twitter_cache{$page} = $img; 2069 } 2070 } 2071 2072 # Pull from the cache. 2073 2074 return () if ($#twitter_cache == -1); 2075 2076 my $n = $#twitter_cache+1; 2077 my $i = int(rand($n)); 2078 my $page = $twitter_cache[$i]; 2079 my $url = $twitter_cache{$page}; 2080 2081 # delete this one from @twitter_cache and from %twitter_cache. 2082 # 2083 @twitter_cache = ( @twitter_cache[0 .. $i-1], 2084 @twitter_cache[$i+1 .. $#twitter_cache] ); 2085 delete $twitter_cache{$page}; 2086 2087 # Keep the size of the cache under the limit by nuking older entries 2088 # 2089 while ($#twitter_cache >= $twitter_cache_size) { 2090 my $page = shift @twitter_cache; 2091 delete $twitter_cache{$page}; 2092 } 2093 2094 LOG ($verbose_load, "picked page $url"); 2095 2096 $suppress_audit = 1; 2097 2098 return ($page, $url); 2099} 2100 2101 2102############################################################################ 2103# 2104# Pick images from Flickr's page of recently-posted photos. 2105# 2106############################################################################ 2107 2108my $flickr_img_url = "http://www.flickr.com/explore/"; 2109 2110# Like LiveJournal, the Flickr page of images tends to update slowly, 2111# so remember the last N entries on it and randomly select from those. 2112 2113# I know that Flickr has an API (http://www.flickr.com/services/api/) 2114# but it was easy enough to scrape the HTML, so I didn't bother exploring. 2115 2116my $flickr_cache_size = 1000; 2117my @flickr_cache = (); # fifo, for ordering by age 2118my %flickr_cache = (); # hash, for detecting dups 2119 2120 2121# flickr_recent 2122sub pick_from_flickr_recent($) { 2123 my ($timeout) = @_; 2124 2125 my $start = 16 * int(rand(100)); 2126 2127 $last_search = $flickr_img_url; # for warnings 2128 $last_search .= "?start=$start" if ($start > 0); 2129 2130 my ( $base, $body ) = get_document ($last_search, undef, $timeout); 2131 2132 # If the document comes back empty. just use the cache. 2133 # return () unless $body; 2134 $body = '' unless defined($body); 2135 2136 my $count = 0; 2137 my $count2 = 0; 2138 2139 if ($body =~ m@{ *"_data": \[ ( .*? \} ) \]@six) { 2140 $body = $1; 2141 } else { 2142 LOG ($verbose_load, "flickr unparsable: $last_search"); 2143 return (); 2144 } 2145 2146 $body =~ s/[\r\n]/ /gs; 2147 $body =~ s/(\},) *(\{)/$1\n$2/gs; # "_flickrModelRegistry" 2148 2149 foreach my $chunk (split (/\n/, $body)) { 2150 my ($img) = ($chunk =~ m@"displayUrl": *"(.*?)"@six); 2151 next unless defined ($img); 2152 $img =~ s/\\//gs; 2153 $img = "http:$img" unless ($img =~ m/^http/s); 2154 2155 my ($user) = ($chunk =~ m/"pathAlias": *"(.*?)"/si); 2156 next unless defined ($user); 2157 2158 my ($id) = ($img =~ m@/\d+/(\d+)_([\da-f]+)_@si); 2159 my ($page) = "https://www.flickr.com/photos/$user/$id/"; 2160 2161 # $img =~ s/_[a-z](\.[a-z\d]+)$/$1/si; # take off "thumb" suffix 2162 2163 $count++; 2164 next if ($flickr_cache{$img}); # already have it 2165 2166 my @pair = ($img, $page, $start); 2167 LOG ($verbose_filter, " candidate: $img"); 2168 push @flickr_cache, \@pair; 2169 $flickr_cache{$img} = \@pair; 2170 $count2++; 2171 } 2172 2173 return () if ($#flickr_cache == -1); 2174 2175 my $n = $#flickr_cache+1; 2176 my $i = int(rand($n)); 2177 my ($img, $page) = @{$flickr_cache[$i]}; 2178 2179 # delete this one from @flickr_cache and from %flickr_cache. 2180 # 2181 @flickr_cache = ( @flickr_cache[0 .. $i-1], 2182 @flickr_cache[$i+1 .. $#flickr_cache] ); 2183 delete $flickr_cache{$img}; 2184 2185 # Keep the size of the cache under the limit by nuking older entries 2186 # 2187 while ($#flickr_cache >= $flickr_cache_size) { 2188 my $pairP = shift @flickr_cache; 2189 my $img = $pairP->[0]; 2190 delete $flickr_cache{$img}; 2191 } 2192 2193 LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img"); 2194 2195 return ($page, $img); 2196} 2197 2198 2199############################################################################ 2200# 2201# Pick images from a random RSS feed on Flickr. 2202# 2203############################################################################ 2204 2205my $flickr_rss_base = ("http://www.flickr.com/services/feeds/" . 2206 "photos_public.gne" . 2207 "?format=rss_200_enc&tagmode=any&tags="); 2208 2209# Picks a random RSS feed; picks a random image from that feed; 2210# returns 2 URLs: the page containing the image, and the image. 2211# Mostly by Joe Mcmahon <mcmahon@yahoo-inc.com> 2212# 2213# flickr_random 2214sub pick_from_flickr_random($) { 2215 my $timeout = shift; 2216 2217 my $words = random_words(','); 2218 my $rss = $flickr_rss_base . $words; 2219 $last_search = $rss; 2220 2221 $_ = $words; 2222 s/,/ /g; 2223 2224 print STDERR "\n\n" if ($verbose_load); 2225 LOG ($verbose_load, "words: $_"); 2226 LOG ($verbose_load, "URL: $last_search"); 2227 2228 $suppress_audit = 1; 2229 2230 my ( $base, $body ) = get_document ($last_search, undef, $timeout); 2231 if (!$base || !$body) { 2232 $body = undef; 2233 return; 2234 } 2235 2236 my $img; 2237 ($base, $img) = pick_image_from_rss ($base, $body); 2238 $body = undef; 2239 return () unless defined ($img); 2240 2241 LOG ($verbose_load, "redirected to: $base"); 2242 return ($base, $img); 2243} 2244 2245 2246############################################################################ 2247# 2248# Pick random images from Instagram. 2249# 2250############################################################################ 2251 2252my $instagram_url_base = "https://api.instagram.com/v1/media/popular"; 2253 2254# instagram_random 2255sub pick_from_instagram($) { 2256 my $timeout = shift; 2257 2258 # Liberated access tokens. 2259 # jsdo.it search for: instagram client_id 2260 # Google search for: instagram "&client_id=" site:jsfiddle.net 2261 my @tokens = (#'b59fbe4563944b6c88cced13495c0f49', # gramfeed.com 2262 #'fa26679250df49c48a33fbcf30aae989', # instac.at 2263 #'d9494686198d4dfeb954979a3e270e5e', # iconosquare.com 2264 #'793ef48bb18e4197b61afce2d799b81c', # jsdo.it 2265 #'67b8a3e0073449bba70600d0fc68e6cb', # jsdo.it 2266 #'26a098e0df4d4b9ea8b4ce6c505b7742', # jsdo.it 2267 #'2437cbcd906a4c10940f990d283d3cd5', # jsdo.it 2268 #'191c7d7d5312464cbd92134f36ffdab5', # jsdo.it 2269 #'acfec809437b4340b2c38f66503af774', # jsdo.it 2270 #'e9f77604a3a24beba949c12d18130988', # jsdo.it 2271 #'2cd7bcf68ae346529770073d311575b3', # jsdo.it 2272 #'830c600fe8d742e2ab3f3b94f9bb22b7', # jsdo.it 2273 #'55865a0397ad41e5997dd95ef4df8da1', # jsdo.it 2274 #'192a5742f3644ea8bed1d25e439286a8', # jsdo.it 2275 #'38ed1477e7a44595861b8842cdb8ba23', # jsdo.it 2276 #'e52f79f645f54488ad0cc47f6f55ade6', # jsfiddle.net 2277 ); 2278 2279 my $tok = $tokens[int(rand($#tokens+1))]; 2280 $last_search = $instagram_url_base . "?client_id=" . $tok; 2281 2282 print STDERR "\n\n" if ($verbose_load); 2283 LOG ($verbose_load, "URL: $last_search"); 2284 2285 my ( $base, $body ) = get_document ($last_search, undef, $timeout); 2286 if (!$base || !$body) { 2287 $body = undef; 2288 return; 2289 } 2290 2291 $body =~ s/("link")/\001$1/gs; 2292 my @chunks = split(/\001/, $body); 2293 shift @chunks; 2294 my @urls = (); 2295 foreach (@chunks) { 2296 s/\\//gs; 2297 my ($url) = m/"link":\s*"(.*?)"/s; 2298 my ($img) = m/"standard_resolution":\{"url":\s*"(.*?)"/s; 2299 ($img) = m/"url":\s*"(.*?)"/s unless $url; 2300 next unless ($url && $img); 2301 push @urls, [ $url, $img ]; 2302 } 2303 2304 if ($#urls < 0) { 2305 LOG ($verbose_load, "no images on $last_search"); 2306 return (); 2307 } 2308 2309 my $i = int(rand($#urls+1)); 2310 my ($url, $img) = @{$urls[$i]}; 2311 2312 LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url"); 2313 return ($url, $img); 2314} 2315 2316 2317############################################################################ 2318# 2319# Pick images from Imgur. 2320# 2321############################################################################ 2322 2323my $imgur_base = 'http://imgur.com/search?qs=thumb&q_any='; 2324 2325sub pick_from_imgur($) { 2326 my $timeout = shift; 2327 2328 my $words = random_words('%20'); 2329 $last_search = $imgur_base . $words; 2330 2331 $_ = $words; 2332 s/%20/ /g; 2333 2334 print STDERR "\n\n" if ($verbose_load); 2335 LOG ($verbose_load, "words: $_"); 2336 LOG ($verbose_load, "URL: $last_search"); 2337 2338 $suppress_audit = 1; 2339 2340 my ( $base, $body ) = get_document ($last_search, undef, $timeout); 2341 if (!$base || !$body) { 2342 $body = undef; 2343 return; 2344 } 2345 2346 my @imgs = ($body =~ m@\bHREF=[\"\']([^\'\"<>]*/gallery/[^\'\"<>]+)@gsi); 2347 return () unless @imgs; 2348 2349 my $n = @imgs; 2350 my $i = int(rand($n)); 2351 my $page = $imgs[$i]; 2352 $page =~ s/[?&].*$//s; 2353 $page = "http://imgur.com$page" if ($page =~ m@^/@s); 2354 2355 my ($id) = ($page =~ m@([^/?&]+)$@s); 2356 my $img = "http://i.imgur.com/$id.jpg"; 2357 2358 LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img"); 2359 2360 return ($page, $img); 2361} 2362 2363 2364############################################################################ 2365# 2366# Pick images from Tumblr. 2367# 2368############################################################################ 2369 2370my $tumblr_base = 'https://www.tumblr.com/search/'; 2371 2372sub pick_from_tumblr($) { 2373 my $timeout = shift; 2374 2375 # Tumblr doesn't have an "or" search, which means our vocabulary is 2376 # a bit too extensive to work well... 2377 2378 my $words = random_word(); 2379 $last_search = $tumblr_base . $words; 2380 2381 print STDERR "\n\n" if ($verbose_load); 2382 LOG ($verbose_load, "words: $words"); 2383 LOG ($verbose_load, "URL: $last_search"); 2384 2385 $suppress_audit = 1; 2386 2387 my ( $base, $body ) = get_document ($last_search, undef, $timeout); 2388 if (!$base || !$body) { 2389 $body = undef; 2390 return; 2391 } 2392 2393 my @imgs0 = ($body =~ m@<IMG\b([^<>]*)>@gsi); 2394 return () unless @imgs0; 2395 my @imgs; 2396 foreach my $img (@imgs0) { 2397 my ($src) = ($img =~ m@\bsrc=[\"\'](.*?)[\"\']@si); 2398 my ($href) = ($img =~ m@\bdata-pin-url=[\"\'](.*?)[\"\']@si); 2399 next unless ($src && $href); 2400 next if ($src =~ m/^data:/s); 2401 foreach ($src, $href) { $_ = "http://www.tumblr.com$_" if (m@^/@s); } 2402 push @imgs, [$href, $src]; 2403 } 2404 return () unless @imgs; 2405 2406 my $n = @imgs; 2407 my $i = int(rand($n)); 2408 my $page = $imgs[$i]->[0]; 2409 my $img = $imgs[$i]->[1]; 2410 2411 LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img"); 2412 2413 return ($page, $img); 2414} 2415 2416 2417############################################################################ 2418# 2419# Pick images by waiting for driftnet to populate a temp dir with files. 2420# Requires driftnet version 0.1.5 or later. 2421# (Driftnet is a program by Chris Lightfoot that sniffs your local ethernet 2422# for images being downloaded by others.) 2423# Driftnet/webcollage integration by jwz. 2424# 2425############################################################################ 2426 2427# driftnet 2428sub pick_from_driftnet($) { 2429 my ($timeout) = @_; 2430 2431 my $id = $driftnet_magic; 2432 my $dir = $driftnet_dir; 2433 my $start = time; 2434 my $now; 2435 2436 error ("\$driftnet_dir unset?") unless ($dir); 2437 $dir =~ s@/+$@@; 2438 2439 error ("$dir unreadable") unless (-d "$dir/."); 2440 2441 $timeout = $http_timeout unless ($timeout); 2442 $last_search = $id; 2443 2444 while ($now = time, $now < $start + $timeout) { 2445 opendir (my $dir, $dir) || error ("$dir: $!"); 2446 while (my $file = readdir($dir)) { 2447 next if ($file =~ m/^\./); 2448 $file = "$dir/$file"; 2449 closedir ($dir); 2450 LOG ($verbose_load, "picked file $file ($id)"); 2451 return ($id, $file); 2452 } 2453 closedir ($dir); 2454 } 2455 LOG (($verbose_net || $verbose_load), "timed out for $id"); 2456 return (); 2457} 2458 2459 2460sub get_driftnet_file($) { 2461 my ($file) = @_; 2462 2463 error ("\$driftnet_dir unset?") unless ($driftnet_dir); 2464 2465 my $id = $driftnet_magic; 2466 error ("$id: $file not in $driftnet_dir?") 2467 unless ($file =~ m@^\Q$driftnet_dir@o); 2468 2469 open (my $in, '<', $file) || error ("$id: $file: $!"); 2470 my $body = ''; 2471 local $/ = undef; # read entire file 2472 $body = <$in>; 2473 close ($in) || error ("$id: $file: $!"); 2474 unlink ($file) || error ("$id: $file: rm: $!"); 2475 return ($id, $body); 2476} 2477 2478 2479sub spawn_driftnet($) { 2480 my ($cmd) = @_; 2481 2482 # make a directory to use. 2483 while (1) { 2484 my $tmp = $ENV{TEMPDIR} || "/tmp"; 2485 $driftnet_dir = sprintf ("$tmp/driftcollage-%08x", rand(0xffffffff)); 2486 LOG ($verbose_exec, "mkdir $driftnet_dir"); 2487 last if mkdir ($driftnet_dir, 0700); 2488 } 2489 2490 if (! ($cmd =~ m/\s/)) { 2491 # if the command didn't have any arguments in it, then it must be just 2492 # a pointer to the executable. Append the default args to it. 2493 my $dargs = $default_driftnet_cmd; 2494 $dargs =~ s/^[^\s]+//; 2495 $cmd .= $dargs; 2496 } 2497 2498 # point the driftnet command at our newly-minted private directory. 2499 # 2500 $cmd .= " -d $driftnet_dir"; 2501 $cmd .= ">/dev/null" unless ($verbose_exec); 2502 2503 my $pid = fork(); 2504 if ($pid < 0) { error ("fork: $!\n"); } 2505 if ($pid) { 2506 # parent fork 2507 push @pids_to_kill, $pid; 2508 LOG ($verbose_exec, "forked for \"$cmd\""); 2509 } else { 2510 # child fork 2511 nontrapping_system ($cmd) || error ("exec: $!"); 2512 } 2513 2514 # wait a bit, then make sure the process actually started up. 2515 # 2516 sleep (1); 2517 error ("pid $pid failed to start \"$cmd\"") 2518 unless (1 == kill (0, $pid)); 2519} 2520 2521# local-directory 2522sub pick_from_local_dir($) { 2523 my ($timeout) = @_; 2524 2525 my $id = $local_magic; 2526 $last_search = $id; 2527 2528 my $dir = $local_dir; 2529 error ("\$local_dir unset?") unless ($dir); 2530 $dir =~ s@/+$@@; 2531 2532 error ("$dir unreadable") unless (-d "$dir/."); 2533 2534 my $v = ($verbose_exec ? "-v" : ""); 2535 my $pick = `xscreensaver-getimage-file $v "$dir"`; 2536 $pick =~ s/\s+$//s; 2537 $pick = "$dir/$pick" unless ($pick =~ m@^/@s); # relative path 2538 2539 LOG ($verbose_load, "picked file $pick ($id)"); 2540 return ($id, $pick); 2541} 2542 2543 2544sub get_local_file($) { 2545 my ($file) = @_; 2546 2547 error ("\$local_dir unset?") unless ($local_dir); 2548 2549 my $id = $local_magic; 2550 error ("$id: $file not in $local_dir?") 2551 unless ($file =~ m@^\Q$local_dir@o); 2552 2553 open (my $in, '<:raw', $file) || error ("$id: $file: $!"); 2554 local $/ = undef; # read entire file 2555 my $body = <$in>; 2556 close ($in) || error ("$id: $file: $!"); 2557 return ($id, $body); 2558} 2559 2560 2561 2562############################################################################ 2563# 2564# Pick a random image in a random way 2565# 2566############################################################################ 2567 2568 2569# Picks a random image on a random page, and returns two URLs: 2570# the page containing the image, and the image. 2571# Returns () if nothing found this time. 2572# 2573 2574sub pick_image(;$) { 2575 my ($timeout) = @_; 2576 2577 $current_state = "select"; 2578 $load_method = "none"; 2579 2580 my $n = int(rand(100)); 2581 my $fn = undef; 2582 my $total = 0; 2583 my @rest = @search_methods; 2584 2585 while (@rest) { 2586 my $pct = shift @rest; 2587 my $name = shift @rest; 2588 my $tfn = shift @rest; 2589 $total += $pct; 2590 if ($total > $n && !defined($fn)) { 2591 $fn = $tfn; 2592 $current_state = $name; 2593 $load_method = $current_state; 2594 } 2595 } 2596 2597 if ($total != 100) { 2598 error ("internal error: \@search_methods totals to $total%!"); 2599 } 2600 2601 record_attempt ($current_state); 2602 return $fn->($timeout); 2603} 2604 2605 2606 2607############################################################################ 2608# 2609# Statistics and logging 2610# 2611############################################################################ 2612 2613sub timestr() { 2614 return strftime ("%H:%M:%S: ", localtime); 2615} 2616 2617sub blurb() { 2618 return "$progname: " . timestr() . "$current_state: "; 2619} 2620 2621sub error($) { 2622 my ($err) = @_; 2623 print STDERR blurb() . "$err\n"; 2624 exit 1; 2625} 2626 2627sub stacktrace() { 2628 my $i = 1; 2629 print STDERR "$progname: stack trace:\n"; 2630 while (1) { 2631 my ($package, $filename, $line, $subroutine) = caller($i++); 2632 last unless defined($package); 2633 $filename =~ s@^.*/@@; 2634 print STDERR " $filename#$line, $subroutine\n"; 2635 } 2636} 2637 2638 2639my $lastlog = ""; 2640 2641sub clearlog() { 2642 $lastlog = ""; 2643} 2644 2645sub showlog() { 2646 my $head = "$progname: DEBUG: "; 2647 foreach (split (/\n/, $lastlog)) { 2648 print STDERR "$head$_\n"; 2649 } 2650 $lastlog = ""; 2651} 2652 2653sub LOG($$) { 2654 my ($print, $msg) = @_; 2655 my $blurb = timestr() . "$current_state: "; 2656 $lastlog .= "$blurb$msg\n"; 2657 print STDERR "$progname: $blurb$msg\n" if $print; 2658} 2659 2660 2661my %stats_attempts; 2662my %stats_successes; 2663my %stats_elapsed; 2664 2665my $last_state = undef; 2666sub record_attempt($) { 2667 my ($name) = @_; 2668 2669 if ($last_state) { 2670 record_failure($last_state) unless ($image_succeeded > 0); 2671 } 2672 $last_state = $name; 2673 2674 clearlog(); 2675 report_performance(); 2676 2677 start_timer($name); 2678 $image_succeeded = 0; 2679 $suppress_audit = 0; 2680} 2681 2682sub record_success($$$) { 2683 my ($name, $url, $base) = @_; 2684 if (defined($stats_successes{$name})) { 2685 $stats_successes{$name}++; 2686 } else { 2687 $stats_successes{$name} = 1; 2688 } 2689 2690 stop_timer ($name, 1); 2691 my $o = $current_state; 2692 $current_state = $name; 2693 save_recent_url ($url, $base); 2694 $current_state = $o; 2695 $image_succeeded = 1; 2696 clearlog(); 2697} 2698 2699 2700sub record_failure($) { 2701 my ($name) = @_; 2702 2703 return if $image_succeeded; 2704 2705 stop_timer ($name, 0); 2706 if ($verbose_load && !$verbose_exec) { 2707 2708 if ($suppress_audit) { 2709 print STDERR "$progname: " . timestr() . "(audit log suppressed)\n"; 2710 return; 2711 } 2712 2713 my $o = $current_state; 2714 $current_state = "DEBUG"; 2715 2716 my $line = "#" x 78; 2717 print STDERR "\n\n\n"; 2718 print STDERR ("#" x 78) . "\n"; 2719 print STDERR blurb() . "failed to get an image. Full audit log:\n"; 2720 print STDERR "\n"; 2721 showlog(); 2722 print STDERR ("-" x 78) . "\n"; 2723 print STDERR "\n\n"; 2724 2725 $current_state = $o; 2726 } 2727 $image_succeeded = 0; 2728} 2729 2730 2731 2732sub stats_of($) { 2733 my ($name) = @_; 2734 my $i = $stats_successes{$name}; 2735 my $j = $stats_attempts{$name}; 2736 $i = 0 unless $i; 2737 $j = 0 unless $j; 2738 return "" . ($j ? int($i * 100 / $j) : "0") . "%"; 2739} 2740 2741 2742my $current_start_time = 0; 2743 2744sub start_timer($) { 2745 my ($name) = @_; 2746 $current_start_time = time; 2747 2748 if (defined($stats_attempts{$name})) { 2749 $stats_attempts{$name}++; 2750 } else { 2751 $stats_attempts{$name} = 1; 2752 } 2753 if (!defined($stats_elapsed{$name})) { 2754 $stats_elapsed{$name} = 0; 2755 } 2756} 2757 2758sub stop_timer($$) { 2759 my ($name, $success) = @_; 2760 $stats_elapsed{$name} += time - $current_start_time; 2761} 2762 2763 2764my $last_report_time = 0; 2765sub report_performance() { 2766 2767 return unless $verbose_warnings; 2768 2769 my $now = time; 2770 return unless ($now >= $last_report_time + $report_performance_interval); 2771 my $ot = $last_report_time; 2772 $last_report_time = $now; 2773 2774 return if ($ot == 0); 2775 2776 my $blurb = "$progname: " . timestr(); 2777 2778 print STDERR "\n"; 2779 print STDERR "${blurb}Current standings:\n"; 2780 2781 foreach my $name (sort keys (%stats_attempts)) { 2782 my $try = $stats_attempts{$name}; 2783 my $suc = $stats_successes{$name} || 0; 2784 my $pct = int($suc * 100 / $try); 2785 my $secs = $stats_elapsed{$name}; 2786 my $secs_link = $secs / $try; 2787 print STDERR sprintf ("$blurb %-14s %4s (%d/%d);" . 2788 " \t %.1f secs/link\n", 2789 "$name:", "$pct%", $suc, $try, $secs_link); 2790 } 2791} 2792 2793 2794 2795my $max_recent_images = 400; 2796my $max_recent_sites = 20; 2797my @recent_images = (); 2798my @recent_sites = (); 2799 2800sub save_recent_url($$) { 2801 my ($url, $base) = @_; 2802 2803 return unless ($verbose_warnings); 2804 2805 $_ = $url; 2806 my ($site) = m@^https?://([^ \t\n\r/:]+)@; 2807 return unless defined ($site); 2808 2809 if ($base eq $driftnet_magic || $base eq $local_magic) { 2810 $site = $base; 2811 @recent_images = (); 2812 } 2813 2814 my $done = 0; 2815 foreach (@recent_images) { 2816 if ($_ eq $url) { 2817 print STDERR blurb() . "WARNING: recently-duplicated image: $url" . 2818 " (on $base via $last_search)\n"; 2819 $done = 1; 2820 last; 2821 } 2822 } 2823 2824 # suppress "duplicate site" warning via %warningless_sites. 2825 # 2826 if ($warningless_sites{$site}) { 2827 $done = 1; 2828 } elsif ($site =~ m@([^.]+\.[^.]+\.[^.]+)$@ && 2829 $warningless_sites{$1}) { 2830 $done = 1; 2831 } elsif ($site =~ m@([^.]+\.[^.]+)$@ && 2832 $warningless_sites{$1}) { 2833 $done = 1; 2834 } 2835 2836 if (!$done) { 2837 foreach (@recent_sites) { 2838 if ($_ eq $site) { 2839 print STDERR blurb() . "WARNING: recently-duplicated site: $site" . 2840 " ($url on $base via $last_search)\n"; 2841 last; 2842 } 2843 } 2844 } 2845 2846 push @recent_images, $url; 2847 push @recent_sites, $site; 2848 shift @recent_images if ($#recent_images >= $max_recent_images); 2849 shift @recent_sites if ($#recent_sites >= $max_recent_sites); 2850} 2851 2852 2853 2854############################################################################## 2855# 2856# other utilities 2857# 2858############################################################################## 2859 2860# Does %-decoding. 2861# 2862sub url_decode($) { 2863 ($_) = @_; 2864 tr/+/ /; 2865 s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg; 2866 return $_; 2867} 2868 2869 2870# Given the raw body of a GIF document, returns the dimensions of the image. 2871# 2872sub gif_size($) { 2873 my ($body) = @_; 2874 my $type = substr($body, 0, 6); 2875 my $s; 2876 return () unless ($type =~ /GIF8[7,9]a/); 2877 $s = substr ($body, 6, 10); 2878 my ($a,$b,$c,$d) = unpack ("C"x4, $s); 2879 return () unless defined ($d); 2880 return (($b<<8|$a), ($d<<8|$c)); 2881} 2882 2883# Given the raw body of a JPEG document, returns the dimensions of the image. 2884# 2885sub jpeg_size($) { 2886 my ($body) = @_; 2887 my $i = 0; 2888 my $L = length($body); 2889 2890 my $c1 = substr($body, $i, 1); $i++; 2891 my $c2 = substr($body, $i, 1); $i++; 2892 return () unless (ord($c1) == 0xFF && ord($c2) == 0xD8); 2893 2894 my $ch = "0"; 2895 while (ord($ch) != 0xDA && $i < $L) { 2896 # Find next marker, beginning with 0xFF. 2897 while (ord($ch) != 0xFF) { 2898 return () if (length($body) <= $i); 2899 $ch = substr($body, $i, 1); $i++; 2900 } 2901 # markers can be padded with any number of 0xFF. 2902 while (ord($ch) == 0xFF) { 2903 return () if (length($body) <= $i); 2904 $ch = substr($body, $i, 1); $i++; 2905 } 2906 2907 # $ch contains the value of the marker. 2908 my $marker = ord($ch); 2909 2910 if (($marker >= 0xC0) && 2911 ($marker <= 0xCF) && 2912 ($marker != 0xC4) && 2913 ($marker != 0xCC)) { # it's a SOFn marker 2914 $i += 3; 2915 return () if (length($body) <= $i); 2916 my $s = substr($body, $i, 4); $i += 4; 2917 my ($a,$b,$c,$d) = unpack("C"x4, $s); 2918 return (($c<<8|$d), ($a<<8|$b)); 2919 2920 } else { 2921 # We must skip variables, since FFs in variable names aren't 2922 # valid JPEG markers. 2923 return () if (length($body) <= $i); 2924 my $s = substr($body, $i, 2); $i += 2; 2925 my ($c1, $c2) = unpack ("C"x2, $s); 2926 my $length = ($c1 << 8) | $c2; 2927 return () if ($length < 2); 2928 $i += $length-2; 2929 } 2930 } 2931 return (); 2932} 2933 2934# Given the raw body of a PNG document, returns the dimensions of the image. 2935# 2936sub png_size($) { 2937 my ($body) = @_; 2938 return () unless ($body =~ m/^\211PNG\r/); 2939 my ($bits) = ($body =~ m/^.{12}(.{12})/s); 2940 return () unless defined ($bits); 2941 return () unless ($bits =~ /^IHDR/); 2942 my ($ign, $w, $h) = unpack("a4N2", $bits); 2943 return ($w, $h); 2944} 2945 2946 2947# Given the raw body of a PNM document, returns the dimensions of the image. 2948# 2949sub pnm_size($) { 2950 my ($body) = @_; 2951 return () unless ($body =~ m/^P[1-6]\r?\n(\d+) +(\d+)\r?\n/s); 2952 return ($1, $2); 2953} 2954 2955 2956# Given the raw body of a GIF, JPEG, or PNG document, returns the dimensions 2957# of the image. 2958# 2959sub image_size($) { 2960 my ($body) = @_; 2961 my ($w, $h) = gif_size ($body); 2962 if ($w && $h) { return ($w, $h); } 2963 ($w, $h) = jpeg_size ($body); 2964 if ($w && $h) { return ($w, $h); } 2965 return png_size ($body); 2966 if ($w && $h) { return ($w, $h); } 2967 return pnm_size ($body); 2968} 2969 2970 2971# returns the full path of the named program, or undef. 2972# 2973sub which($) { 2974 my ($prog) = @_; 2975 foreach (split (/:/, $ENV{PATH})) { 2976 my $path = "$_/$prog"; 2977 if (-x $path) { 2978 return $path; 2979 } 2980 } 2981 return undef; 2982} 2983 2984 2985# Like rand(), but chooses numbers with a bell curve distribution. 2986sub bellrand(;$) { 2987 ($_) = @_; 2988 $_ = 1.0 unless defined($_); 2989 $_ /= 3.0; 2990 return (rand($_) + rand($_) + rand($_)); 2991} 2992 2993 2994sub exit_cleanup() { 2995 x_cleanup(); 2996 print STDERR "$progname: exiting\n" if ($verbose_warnings); 2997 if (@pids_to_kill) { 2998 print STDERR blurb() . "killing: " . join(' ', @pids_to_kill) . "\n"; 2999 kill ('TERM', @pids_to_kill); 3000 } 3001} 3002 3003sub signal_cleanup($) { 3004 my ($sig) = @_; 3005 print STDERR blurb() . (defined($sig) 3006 ? "caught signal $sig." 3007 : "exiting.") 3008 . "\n" 3009 if ($verbose_exec || $verbose_warnings); 3010 exit 1; 3011} 3012 3013 3014 3015############################################################################## 3016# 3017# Generating a list of urls only 3018# 3019############################################################################## 3020 3021sub url_only_output() { 3022 do { 3023 my ($base, $img) = pick_image; 3024 if ($img) { 3025 $base =~ s/ /%20/g; 3026 $img =~ s/ /%20/g; 3027 print "$img $base\n"; 3028 } 3029 } while (1); 3030} 3031 3032############################################################################## 3033# 3034# Running as an xscreensaver module, or as a web page imagemap 3035# 3036############################################################################## 3037 3038my ($image_png, $image_tmp1, $image_tmp2); 3039{ 3040 my $seed = rand(0xFFFFFFFF); 3041 $image_png = sprintf ("%s/webcollage-%08x", 3042 ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp"), 3043 $seed); 3044 $image_tmp1 = $image_png . '-1.png'; 3045 $image_tmp2 = $image_png . '-2.png'; 3046 $image_png .= '.png'; 3047} 3048 3049 3050my $filter_cmd = undef; 3051my $post_filter_cmd = undef; 3052my $background = undef; 3053 3054my @imagemap_areas = (); 3055my $imagemap_html_tmp = undef; 3056my $imagemap_jpg_tmp = undef; 3057 3058 3059my $img_width; # size of the image being generated. 3060my $img_height; 3061 3062my $delay = 2; 3063 3064sub x_cleanup() { 3065 unlink $image_png, $image_tmp1, $image_tmp2; 3066 unlink $imagemap_html_tmp, $imagemap_jpg_tmp 3067 if (defined ($imagemap_html_tmp)); 3068} 3069 3070 3071# Like system, but prints status about exit codes, and kills this process 3072# with whatever signal killed the sub-process, if any. 3073# 3074sub nontrapping_system(@) { 3075 $! = 0; 3076 3077 $_ = join(" ", @_); 3078 s/\"[^\"]+\"/\"...\"/g; 3079 3080 LOG ($verbose_exec, "executing \"$_\""); 3081 3082 my $rc = system @_; 3083 3084 if ($rc == 0) { 3085 LOG ($verbose_exec, "subproc exited normally."); 3086 } elsif (($rc & 0xff) == 0) { 3087 $rc >>= 8; 3088 LOG ($verbose_exec, "subproc exited with status $rc."); 3089 } else { 3090 if ($rc & 0x80) { 3091 LOG ($verbose_exec, "subproc dumped core."); 3092 $rc &= ~0x80; 3093 } 3094 LOG ($verbose_exec, "subproc died with signal $rc."); 3095 # die that way ourselves. 3096 kill $rc, $$; 3097 } 3098 3099 return $rc; 3100} 3101 3102 3103# Creates a solid-colored PNG. 3104# 3105sub pngmake($$$$) { 3106 my ($outfile, $bgcolor, $w, $h) = @_; 3107 3108 my @cmd; 3109 if ($webcollage_helper) { 3110 @cmd = ($webcollage_helper, $bgcolor, $w, $h, $outfile); 3111 } else { 3112 @cmd = ($convert_cmd, '-size', "${w}x${h}", "xc:$bgcolor", $outfile); 3113 } 3114 3115 my $rc = nontrapping_system (@cmd); 3116 if ($rc != 0) { 3117 LOG(0, "failed to create $bgcolor image: \"$outfile\""); 3118 exit(1); 3119 } 3120} 3121 3122 3123sub pick_root_displayer() { 3124 my @names = (); 3125 3126 if ($cocoa_p) { 3127 # see "xscreensaver/hacks/webcollage-cocoa.m" 3128 return "echo COCOA LOAD "; 3129 } 3130 3131 foreach my $cmd (@root_displayers) { 3132 $_ = $cmd; 3133 my ($name) = m/^([^ ]+)/; 3134 push @names, "\"$name\""; 3135 LOG ($verbose_exec, "looking for $name..."); 3136 foreach my $dir (split (/:/, $ENV{PATH})) { 3137 LOG ($verbose_exec, " checking $dir/$name"); 3138 return $cmd if (-x "$dir/$name"); 3139 } 3140 } 3141 3142 $names[$#names] = "or " . $names[$#names]; 3143 error "none of: " . join (", ", @names) . " were found on \$PATH."; 3144} 3145 3146 3147my $png_to_root_window_cmd = undef; 3148 3149 3150sub x_or_image_output($) { 3151 my ($window_id) = @_; 3152 3153 # Adjust the PATH for OS X 10.10. 3154 # 3155 $_ = $0; 3156 s:/[^/]*$::; 3157 s/([^a-zA-Z0-9._\-+\/])/\\$1/g; 3158 $ENV{PATH} = "$_:$ENV{PATH}"; 3159 3160 # Check for our helper program, to see whether we need to use imagemagick. 3161 # 3162 $_ = "webcollage-helper"; 3163 3164 if (! defined ($webcollage_helper)) { 3165 $webcollage_helper = which ($_); 3166 } 3167 3168 if (defined ($webcollage_helper)) { 3169 LOG ($verbose_decode, "found \"$webcollage_helper\""); 3170 } else { 3171 LOG (($verbose_decode || $verbose_load), "no $_ program"); 3172 } 3173 3174 if ($cocoa_p && !defined ($webcollage_helper)) { 3175 error ("webcollage-helper not found in Cocoa-mode!"); 3176 } 3177 3178 if (!$cocoa_p && defined ($webcollage_helper)) { 3179 foreach ($image_png, $image_tmp1, $image_tmp2) { 3180 s/\.png$/.jpg/si; 3181 } 3182 } 3183 3184 # make sure the various programs we execute exist, right up front. 3185 # 3186 my @progs = (); 3187 3188 push @progs, $convert_cmd unless defined($webcollage_helper); 3189 3190 foreach (@progs) { 3191 which ($_) || error "$_ not found on \$PATH."; 3192 } 3193 3194 # find a root-window displayer program. 3195 # 3196 if (!$no_output_p) { 3197 $png_to_root_window_cmd = pick_root_displayer(); 3198 } 3199 3200 if (defined ($window_id)) { 3201 error ("-window-id only works if xscreensaver-getimage is installed") 3202 unless ($png_to_root_window_cmd =~ m/^xscreensaver-getimage\b/); 3203 3204 error ("unparsable window id: $window_id") 3205 unless ($window_id =~ m/^\d+$|^0x[\da-f]+$/i); 3206 $png_to_root_window_cmd =~ s/--?root\b/$window_id/ || 3207 error ("unable to munge displayer: $png_to_root_window_cmd"); 3208 } 3209 3210 if (!$img_width || !$img_height) { 3211 3212 if (!defined ($window_id) && 3213 defined ($ENV{XSCREENSAVER_WINDOW})) { 3214 $window_id = $ENV{XSCREENSAVER_WINDOW}; 3215 } 3216 3217 if (!defined ($window_id)) { 3218 $_ = "xdpyinfo"; 3219 which ($_) || error "$_ not found on \$PATH."; 3220 $_ = `$_`; 3221 ($img_width, $img_height) = m/dimensions: *(\d+)x(\d+) /; 3222 if (!defined($img_height)) { 3223 error "xdpyinfo failed."; 3224 } 3225 } else { # we have a window id 3226 $_ = "xwininfo"; 3227 which ($_) || error "$_ not found on \$PATH."; 3228 $_ .= " -id $window_id"; 3229 $_ = `$_`; 3230 ($img_width, $img_height) = m/^\s*Width:\s*(\d+)\n\s*Height:\s*(\d+)\n/m; 3231 3232 if (!defined($img_height)) { 3233 error "xwininfo failed."; 3234 } 3235 } 3236 } 3237 3238 my $bgcolor = "#000000"; 3239 my $bgimage = undef; 3240 3241 if ($background) { 3242 if ($background =~ m/^\#[0-9a-f]+$/i) { 3243 $bgcolor = $background; 3244 3245 } elsif (-r $background) { 3246 $bgimage = $background; 3247 3248 } elsif (! $background =~ m@^[-a-z0-9 ]+$@i) { 3249 error "not a color or readable file: $background"; 3250 3251 } else { 3252 # default to assuming it's a color 3253 $bgcolor = $background; 3254 } 3255 } 3256 3257 # Create the sold-colored base image. 3258 # 3259 LOG ($verbose_decode, "creating base image: ${img_width}x${img_height}"); 3260 $_ = pngmake ($image_png, $bgcolor, $img_width, $img_height); 3261 3262 # Paste the default background image in the middle of it. 3263 # 3264 if ($bgimage) { 3265 open (my $in, '<:raw', $bgimage) || error ("$bgimage: $!"); 3266 local $/ = undef; # read entire file 3267 my $body = <$in>; 3268 close ($in) || error ("$bgimage: $!"); 3269 paste_image ('init', $image_png, $body, 'init', 1); 3270 } 3271 3272 clearlog(); 3273 3274 while (1) { 3275 my ($base, $img) = pick_image(); 3276 my $source = $current_state; 3277 $current_state = "loadimage"; 3278 if ($img) { 3279 my ($headers, $body) = get_document ($img, $base); 3280 if ($body) { 3281 paste_image ($base, $img, $body, $source); 3282 $body = undef; 3283 } 3284 } 3285 $current_state = "idle"; 3286 $load_method = "none"; 3287 3288 unlink $image_tmp1, $image_tmp2; 3289 sleep $delay; 3290 } 3291} 3292 3293sub paste_image($$$$;$) { 3294 my ($base, $img, $body, $source, $init_p) = @_; 3295 3296 $current_state = "paste"; 3297 3298 $suppress_audit = 0; 3299 3300 LOG ($verbose_decode, "got $img (" . length($body) . ")"); 3301 3302 my ($iw, $ih) = image_size ($body); 3303 if (!$iw || !$ih) { 3304 LOG (($verbose_decode || $verbose_load), 3305 "not a GIF, JPG, or PNG" . 3306 (($body =~ m@<(base|html|head|body|script|table|a href)>@i) 3307 ? " (looks like HTML)" : "") . 3308 ": $img"); 3309 $suppress_audit = 1; 3310 $body = undef; 3311 return 0; 3312 } 3313 3314 if ($iw <= 0 || $ih <= 0 || $iw > 9999 || $ih > 9999) { 3315 LOG (($verbose_decode || $verbose_load), 3316 "ludicrous image dimensions: $iw x $ih (" . length($body) . 3317 "): $img"); 3318 $body = undef; 3319 return 0; 3320 } 3321 3322 open (my $out, '>:raw', $image_tmp1) || error ("writing $image_tmp1: $!"); 3323 (print $out $body) || error ("writing $image_tmp1: $!"); 3324 close ($out) || error ("writing $image_tmp1: $!"); 3325 3326 record_success ($load_method, $img, $base); 3327 3328 my $ow = $iw; # used only for error messages 3329 my $oh = $ih; 3330 3331 # don't just tack this onto the front of the pipeline -- we want it to 3332 # be able to change the size of the input image. 3333 # 3334 if ($filter_cmd && !$init_p) { 3335 LOG ($verbose_decode, "running $filter_cmd"); 3336 3337 # #### Historically, $filter_cmd read and write PPM files. 3338 # This is doing PNG or JPEG now. I'll bet nobody uses this. 3339 3340 my $rc = nontrapping_system "($filter_cmd) < $image_tmp1 >$image_tmp2"; 3341 if ($rc != 0) { 3342 LOG(($verbose_decode || $verbose_load), 3343 "failed command: \"$filter_cmd\""); 3344 LOG(($verbose_decode || $verbose_load), 3345 "failed URL: \"$img\" (${ow}x$oh)"); 3346 return; 3347 } 3348 rename ($image_tmp2, $image_tmp1); 3349 3350 # re-get the width/height in case the filter resized it. 3351 open (my $imgf, '<:raw', $image_tmp1) || return 0; 3352 my $b = ''; 3353 sysread ($imgf, $b, 10240); 3354 close $imgf; 3355 ($iw, $ih) = image_size ($b); 3356 return 0 unless ($iw && $ih); 3357 } 3358 3359 my $target_w = $img_width; # max rectangle into which the image must fit 3360 my $target_h = $img_height; 3361 3362 my $scale = 1.0; 3363 3364 my $crop_x = 0; # the sub-rectangle of the image 3365 my $crop_y = 0; # that we will actually paste. 3366 my $crop_w = $iw; 3367 my $crop_h = $ih; 3368 my $x = 0; 3369 my $y = 0; 3370 3371 if (!$init_p) { 3372 3373 # Usually scale the image to fit on the screen -- but sometimes scale it 3374 # to fit on half or a quarter of the screen. (We do this by reducing the 3375 # size of the target rectangle.) Note that the image is not merely scaled 3376 # to fit; we instead cut the image in half repeatedly until it fits in the 3377 # target rectangle -- that gives a wider distribution of sizes. 3378 # 3379 if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } # reduce target rect 3380 if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } 3381 3382 if ($iw > $target_w || $ih > $target_h) { 3383 while ($iw > $target_w || 3384 $ih > $target_h) { 3385 $iw = int($iw / 2); 3386 $ih = int($ih / 2); 3387 $scale /= 2; 3388 } 3389 if ($iw <= 10 || $ih <= 10) { 3390 LOG ($verbose_decode, "scaling ${ow}x${oh} to ${iw}x$ih" . 3391 " would have been bogus."); 3392 return 0; 3393 } 3394 3395 $crop_w = $iw; 3396 $crop_h = $ih; 3397 3398 LOG ($verbose_decode, "scaling ${ow}x${oh} to ${iw}x$ih ($scale)"); 3399 } 3400 3401 3402 my $src = $image_tmp1; 3403 3404 # The chance that we will randomly crop out a section of an image starts 3405 # out fairly low, but goes up for images that are very large, or images 3406 # that have ratios that make them look like banners (we try to avoid 3407 # banner images entirely, but they slip through when the IMG tags didn't 3408 # have WIDTH and HEIGHT specified.) 3409 # 3410 my $crop_chance = 0.2; 3411 if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) { 3412 $crop_chance += 0.2; 3413 } 3414 if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) { 3415 $crop_chance += 0.2; 3416 } 3417 if ($min_ratio && ($iw * $min_ratio) > $ih) { 3418 $crop_chance += 0.7; 3419 } 3420 3421 if ($crop_chance > 0.1) { 3422 LOG ($verbose_decode, "crop chance: $crop_chance"); 3423 } 3424 3425 if (rand() < $crop_chance) { 3426 3427 my $ow = $crop_w; 3428 my $oh = $crop_h; 3429 3430 if ($crop_w > $min_width) { 3431 # if it's a banner, select the width linearly. 3432 # otherwise, select a bell. 3433 my $r = (($min_ratio && ($iw * $min_ratio) > $ih) 3434 ? rand() 3435 : bellrand()); 3436 $crop_w = $min_width + int ($r * ($crop_w - $min_width)); 3437 $crop_x = int (rand() * ($ow - $crop_w)); 3438 } 3439 if ($crop_h > $min_height) { 3440 # height always selects as a bell. 3441 $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height)); 3442 $crop_y = int (rand() * ($oh - $crop_h)); 3443 } 3444 3445 # Clip it to the actual post-scaling image size. 3446 if ($crop_x + $crop_w > $iw) { $crop_w = $iw - $crop_x; } 3447 if ($crop_y + $crop_h > $ih) { $crop_h = $ih - $crop_y; } 3448 if ($crop_x < 0) { $crop_w += $crop_x; $crop_x = 0; } 3449 if ($crop_y < 0) { $crop_h += $crop_y; $crop_y = 0; } 3450 3451 if ($crop_x != 0 || $crop_y != 0 || 3452 $crop_w != $iw || $crop_h != $ih) { 3453 LOG ($verbose_decode, 3454 "randomly cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y"); 3455 } 3456 } 3457 3458 # Where the image should logically land -- this might be negative. 3459 # 3460 $x = int((rand() * ($img_width + $crop_w/2)) - $crop_w*3/4); 3461 $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4); 3462 3463 # if we have chosen to paste the image outside of the rectangle of the 3464 # screen, then we need to crop it. 3465 # 3466 if ($x < 0 || 3467 $y < 0 || 3468 $x + $crop_w > $img_width || 3469 $y + $crop_h > $img_height) { 3470 3471 LOG ($verbose_decode, 3472 "cropping for effective paste of ${crop_w}x${crop_h} \@ $x,$y"); 3473 3474 if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; } 3475 if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; } 3476 3477 if ($x + $crop_w >= $img_width) { $crop_w = $img_width - $x - 1; } 3478 if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; } 3479 } 3480 3481 # If any cropping needs to happen, add pnmcut. 3482 # 3483 if ($crop_x != 0 || $crop_y != 0 || 3484 $crop_w != $iw || $crop_h != $ih) { 3485 $iw = $crop_w; 3486 $ih = $crop_h; 3487 LOG ($verbose_decode, "cropping to ${crop_w}x$crop_h \@ " . 3488 "$crop_x,$crop_y"); 3489 } 3490 3491 LOG ($verbose_decode, "pasting ${iw}x$ih \@ $x,$y in $image_png"); 3492 } 3493 3494 my @cmd; 3495 if (defined ($webcollage_helper)) { 3496 @cmd = ($webcollage_helper, 3497 $image_tmp1, $image_png, 3498 $scale, $opacity, 3499 $crop_x, $crop_y, $x, $y, 3500 $iw, $ih); 3501 } else { 3502 @cmd = ($convert_cmd, 3503 $image_png, 3504 '(', 3505 $image_tmp1 . '[0]', 3506 '-scale', sprintf("%.2f%%", 100 * $scale), 3507 '-crop', "${iw}x${ih}+${crop_x}+${crop_y}", 3508 '-geometry', "+${x}+${y}", 3509 3510 ($init_p ? () : 3511 ( 3512 # Blurry edges with rounded corners 3513 '-alpha', 'set', 3514 '-virtual-pixel', 'transparent', 3515 '-channel', 'A', 3516 '-blur', '0x12', 3517 '-level', '50%,100%', 3518 3519 # Overall transparency 3520 '-evaluate', 'multiply', $opacity, 3521 3522 '+channel', 3523 )), 3524 ')', 3525 '-composite', 3526 '+repage', 3527 '-strip', 3528 $image_png); 3529 } 3530 3531 #### $verbose_decode should mean 2>/dev/null 3532 3533 my $rc = nontrapping_system (@cmd); 3534 3535 if (-z $image_png) { 3536 LOG (1, "failed command: \"@cmd\""); 3537 print STDERR "\naudit log:\n\n\n"; 3538 print STDERR ("#" x 78) . "\n"; 3539 print STDERR blurb() . "$image_png has zero size\n"; 3540 showlog(); 3541 print STDERR "\n\n"; 3542 exit (1); 3543 } 3544 3545 if ($rc != 0) { 3546 LOG (($verbose_decode || $verbose_load), "failed command: \"@cmd\""); 3547 LOG (($verbose_decode || $verbose_load), 3548 "failed URL: \"$img\" (${ow}x$oh)"); 3549 return; 3550 } 3551 3552 my $target = "$image_png"; 3553 3554 # don't just tack this onto the end of the pipeline -- we don't want it 3555 # to end up in $image_png, because we don't want the results to be 3556 # cumulative. 3557 # 3558 if ($post_filter_cmd) { 3559 3560 # #### Historically, $post_filter_cmd read and write PPM files. 3561 # This is doing PNG or JPEG now. I'll bet nobody uses this. 3562 3563 $target = $image_tmp1; 3564 my $cmd = "($post_filter_cmd) < $image_png > $target"; 3565 $rc = nontrapping_system ($cmd); 3566 if ($rc != 0) { 3567 LOG ($verbose_decode, "filter failed: \"$post_filter_cmd\"\n"); 3568 return; 3569 } 3570 } 3571 3572 if (!$no_output_p) { 3573 my $tsize = (stat($target))[7]; 3574 if ($tsize > 200) { 3575 my $cmd = "$png_to_root_window_cmd $target"; 3576 3577 # xv seems to hate being killed. it tends to forget to clean 3578 # up after itself, and leaves windows around and colors allocated. 3579 # I had this same problem with vidwhacker, and I'm not entirely 3580 # sure what I did to fix it. But, let's try this: launch xv 3581 # in the background, so that killing this process doesn't kill it. 3582 # it will die of its own accord soon enough. So this means we 3583 # start pumping bits to the root window in parallel with starting 3584 # the next network retrieval, which is probably a better thing 3585 # to do anyway. 3586 # 3587 $cmd .= " &" unless ($cocoa_p); 3588 3589 $rc = nontrapping_system ($cmd); 3590 3591 if ($rc != 0) { 3592 LOG (($verbose_decode || $verbose_load), "display failed: \"$cmd\""); 3593 return; 3594 } 3595 3596 } else { 3597 LOG ($verbose_decode, "$target size is $tsize"); 3598 } 3599 } 3600 3601 if (defined($source)) { 3602 $source .= "-" . stats_of($source); 3603 print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n" 3604 if ($verbose_imgmap); 3605 if ($imagemap_base) { 3606 update_imagemap ($base, $x, $y, $iw, $ih, 3607 $image_png, $img_width, $img_height); 3608 } 3609 } 3610 3611 clearlog(); 3612 3613 return 1; 3614} 3615 3616 3617sub update_imagemap($$$$$$$$) { 3618 my ($url, $x, $y, $w, $h, $image_png, $image_width, $image_height) = @_; 3619 3620 $current_state = "imagemap"; 3621 3622 my $max_areas = 200; 3623 3624 $url = html_quote ($url); 3625 push @imagemap_areas, [$x, $y, $w, $h, $url]; 3626 shift @imagemap_areas if (@imagemap_areas > $max_areas); 3627 LOG ($verbose_decode, "area: $x,$y,$w,$h"); 3628 3629 my $map_name = $imagemap_base; 3630 $map_name =~ s@^.*/@@; 3631 $map_name = 'collage' if ($map_name eq ''); 3632 3633 my $imagemap_html = $imagemap_base . ".html"; 3634 my $imagemap_jpg = $imagemap_base . ".jpg"; 3635 my $imagemap_jpg2 = $imagemap_jpg; 3636 $imagemap_jpg2 =~ s@^.*/@@gs; 3637 3638 if (!defined ($imagemap_html_tmp)) { 3639 $imagemap_html_tmp = $imagemap_html . sprintf (".%08x", rand(0xffffffff)); 3640 $imagemap_jpg_tmp = $imagemap_jpg . sprintf (".%08x", rand(0xffffffff)); 3641 } 3642 3643 # Read the imagemap html file (if any) to get a template. 3644 # 3645 my $template_html = ''; 3646 { 3647 if (open (my $in, '<', $imagemap_html)) { 3648 local $/ = undef; # read entire file 3649 $template_html = <$in>; 3650 close $in; 3651 LOG ($verbose_decode, "read template $imagemap_html"); 3652 } 3653 3654 if (! ($template_html =~ m/\.webcollage_box\b/s)) { # missing or old 3655 $template_html = 3656 '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 3657 "http://www.w3.org/TR/html4/loose.dtd"> 3658<HTML> 3659 <HEAD> 3660 <BASE TARGET="_new"> 3661 <meta HTTP-EQUIV="Refresh" content="60" /> 3662 <TITLE>WebCollage</TITLE> 3663 <STYLE TYPE="text/css"> 3664 <!-- 3665 body { color: #FFF; max-width: 100%; } 3666 .webcollage_date, .webcollage_size { 3667 display: block; margin-top: 4px; font-size: 7pt; color: #888; 3668 } 3669 .webcollage_date { float: left; } 3670 .webcollage_size { float: right; } 3671 3672 .webcollage_frame { 3673 overflow: hidden; 3674 position: relative; 3675 padding-bottom: 56.25%; 3676 padding-bottom: 75%; /* 4:3 aspect ratio */ 3677 border: 1px solid #888; 3678 background: #000; 3679 } 3680 3681 .webcollage_box { 3682 position: absolute; top: 0; left: 0; 3683 border: 0; margin: 0; padding: 0; 3684 width: 100%; 3685 height: 100%; 3686 } 3687 3688 .webcollage_box > img { width: 100%; height: 100%; border: 0; } 3689 3690 .webcollage_frame > a { 3691 display: block; 3692 position: absolute; 3693 border-radius: 16px; 3694 } 3695 3696 .webcollage_frame > a:hover { 3697 background: rgba(1,1,1,.25); 3698 } 3699 --> 3700 </STYLE> 3701 </HEAD> 3702 <BODY> 3703 <DIV CLASS="webcollage_frame"> 3704 <DIV CLASS="webcollage_box"></DIV> 3705 </DIV> 3706 </BODY> 3707</HTML> 3708'; 3709 3710 LOG ($verbose_decode, "created dummy template"); 3711 } 3712 } 3713 3714 # Write the jpg to a tmp file 3715 # 3716 { 3717 my @cmd; 3718 if (defined($webcollage_helper)) { 3719 @cmd = ('cp', '-p', $image_png, $imagemap_jpg_tmp); 3720 } else { 3721 @cmd = ($convert_cmd, $image_png, 'jpeg:' . $imagemap_jpg_tmp); 3722 } 3723 my $rc = nontrapping_system (@cmd); 3724 if ($rc != 0) { 3725 error ("imagemap jpeg failed: \"@cmd\"\n"); 3726 } 3727 } 3728 3729 # Write the html to a tmp file 3730 # 3731 { 3732 my $body = $template_html; 3733 my $img = (" <DIV CLASS=\"webcollage_box\">" . 3734 "<IMG SRC=\"$imagemap_jpg2\">" . 3735 "</DIV>\n"); 3736 foreach my $a (@imagemap_areas) { 3737 my ($x, $y, $w, $h, $u) = @$a; 3738 $x /= $img_width / 100; 3739 $y /= $img_height / 100; 3740 $w /= $img_width / 100; 3741 $h /= $img_height / 100; 3742 foreach ($x, $y, $w, $h) { $_ = sprintf("%.1f%%", $_); } 3743 $u = html_quote($u); 3744 $img .= (" <A HREF=\"$u\" STYLE=\"" . 3745 "left:$x;top:$y;width:$w;height:$h\"/>\n"); 3746 } 3747 3748 $img = ("<DIV CLASS=\"webcollage_frame\">\n" . 3749 $img . 3750 " </DIV>\n"); 3751 $body =~ s@<DIV \s+ CLASS=[\"\']webcollage_frame[\"\']> 3752 .*? </DIV> .*? </DIV>@$img@sex || 3753 error ("$imagemap_html_tmp: unable to splice image"); 3754 3755 3756 # if there are magic webcollage spans in the html, update those too. 3757 # 3758 { 3759 my @st = stat ($imagemap_jpg_tmp); 3760 my $date = strftime("%d-%b-%Y %l:%M:%S %p %Z", localtime($st[9])); 3761 my $size = int(($st[7] / 1024) + 0.5) . "K"; 3762 $body =~ s@(<SPAN\s+CLASS=\"webcollage_date\">).*?(</SPAN>)@$1$date$2@si; 3763 $body =~ s@(<SPAN\s+CLASS=\"webcollage_size\">).*?(</SPAN>)@$1$size$2@si; 3764 } 3765 3766 open (my $out, '>', $imagemap_html_tmp) || error("$imagemap_html_tmp: $!"); 3767 (print $out $body) || error("$imagemap_html_tmp: $!"); 3768 close ($out) || error("$imagemap_html_tmp: $!"); 3769 LOG ($verbose_decode, "wrote $imagemap_html_tmp"); 3770 } 3771 3772 # Rename the two tmp files to the real files 3773 # 3774 rename ($imagemap_html_tmp, $imagemap_html) || 3775 error "renaming $imagemap_html_tmp to $imagemap_html"; 3776 LOG ($verbose_decode, "wrote $imagemap_html"); 3777 3778 if ($imagemap_jpg_tmp ne $image_png) { 3779 rename ($imagemap_jpg_tmp, $imagemap_jpg) || 3780 error "renaming $imagemap_jpg_tmp to $imagemap_jpg"; 3781 LOG ($verbose_decode, "wrote $imagemap_jpg"); 3782 } 3783} 3784 3785 3786# Figure out what the proxy server should be, either from environment 3787# variables or by parsing the output of the (MacOS) program "scutil", 3788# which tells us what the system-wide proxy settings are. 3789# 3790sub set_proxy() { 3791 3792 if (! defined($http_proxy)) { 3793 # historical suckage: the environment variable name is lower case. 3794 $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY}; 3795 } 3796 3797 if (defined ($http_proxy)) { 3798 if ($http_proxy && $http_proxy =~ m@^https?://([^/]*)/?$@ ) { 3799 # historical suckage: allow "http://host:port" as well as "host:port". 3800 $http_proxy = $1; 3801 } 3802 3803 } else { 3804 my $proxy_data = `scutil --proxy 2>/dev/null`; 3805 my ($server) = ($proxy_data =~ m/\bHTTPProxy\s*:\s*([^\s]+)/s); 3806 my ($port) = ($proxy_data =~ m/\bHTTPPort\s*:\s*([^\s]+)/s); 3807 # Note: this ignores the "ExceptionsList". 3808 if ($server) { 3809 $http_proxy = $server; 3810 $http_proxy .= ":$port" if $port; 3811 } 3812 } 3813 3814 delete $ENV{http_proxy}; 3815 delete $ENV{HTTP_PROXY}; 3816 delete $ENV{https_proxy}; 3817 delete $ENV{HTTPS_PROXY}; 3818 delete $ENV{PERL_LWP_ENV_PROXY}; 3819 3820 if ($http_proxy) { 3821 $http_proxy = 'http://' . $http_proxy; 3822 LOG ($verbose_net, "proxy server: $http_proxy"); 3823 } else { 3824 $http_proxy = undef; # for --proxy '' 3825 } 3826} 3827 3828 3829sub init_signals() { 3830 3831 $SIG{HUP} = \&signal_cleanup; 3832 $SIG{INT} = \&signal_cleanup; 3833 $SIG{QUIT} = \&signal_cleanup; 3834 $SIG{ABRT} = \&signal_cleanup; 3835 $SIG{KILL} = \&signal_cleanup; 3836 $SIG{TERM} = \&signal_cleanup; 3837 3838 # Need this so that if subprocess filters die, we don't die. 3839 $SIG{PIPE} = 'IGNORE'; 3840} 3841 3842END { exit_cleanup(); } 3843 3844 3845sub main() { 3846 $| = 1; 3847 srand(time ^ $$); 3848 3849 my $verbose = 0; 3850 my $dict; 3851 my $driftnet_cmd = 0; 3852 3853 $current_state = "init"; 3854 $load_method = "none"; 3855 3856 my $root_p = 0; 3857 my $window_id = undef; 3858 3859 while ($#ARGV >= 0) { 3860 $_ = shift @ARGV; 3861 if (m/^--?d(i(s(p(l(a(y)?)?)?)?)?)?$/s) { 3862 $ENV{DISPLAY} = shift @ARGV; 3863 } elsif (m/^--?root$/s) { 3864 $root_p = 1; 3865 } elsif (m/^--?window-id$/s) { 3866 $window_id = shift @ARGV; 3867 $root_p = 1; 3868 } elsif (m/^--?no-output$/s) { 3869 $no_output_p = 1; 3870 } elsif (m/^--?urls(-only)?$/s) { 3871 $urls_only_p = 1; 3872 $no_output_p = 1; 3873 } elsif (m/^--?cocoa$/s) { 3874 $cocoa_p = 1; 3875 } elsif (m/^--?imagemap$/s) { 3876 $imagemap_base = shift @ARGV; 3877 $no_output_p = 1; 3878 } elsif (m/^--?verbose$/s) { 3879 $verbose++; 3880 } elsif (m/^-v+$/) { 3881 $verbose += length($_)-1; 3882 } elsif (m/^--?delay$/s) { 3883 $delay = shift @ARGV; 3884 } elsif (m/^--?timeout$/s) { 3885 $http_timeout = shift @ARGV; 3886 } elsif (m/^--?filter$/s) { 3887 $filter_cmd = shift @ARGV; 3888 } elsif (m/^--?filter2$/s) { 3889 $post_filter_cmd = shift @ARGV; 3890 } elsif (m/^--?(background|bg)$/s) { 3891 $background = shift @ARGV; 3892 } elsif (m/^--?size$/s) { 3893 $_ = shift @ARGV; 3894 if (m@^(\d+)x(\d+)$@) { 3895 $img_width = $1; 3896 $img_height = $2; 3897 } else { 3898 error "argument to \"--size\" must be of the form \"640x400\""; 3899 } 3900 } elsif (m/^--?(http-)?proxy$/s) { 3901 $http_proxy = shift @ARGV; 3902 } elsif (m/^--?dict(ionary)?$/s) { 3903 $dict = shift @ARGV; 3904 } elsif (m/^--?opacity$/s) { 3905 $opacity = shift @ARGV; 3906 error ("opacity must be between 0.0 and 1.0") 3907 if ($opacity <= 0 || $opacity > 1); 3908 } elsif (m/^--?driftnet$/s) { 3909 @search_methods = ( 100, "driftnet", \&pick_from_driftnet ); 3910 if (! ($ARGV[0] =~ m/^-/)) { 3911 $driftnet_cmd = shift @ARGV; 3912 } else { 3913 $driftnet_cmd = $default_driftnet_cmd; 3914 } 3915 } elsif (m/^--?dir(ectory)?$/s) { 3916 @search_methods = ( 100, "local", \&pick_from_local_dir ); 3917 if (! ($ARGV[0] =~ m/^-/)) { 3918 $local_dir = shift @ARGV; 3919 } else { 3920 error ("local directory path must be set") 3921 } 3922 } elsif (m/^--?fps$/s) { 3923 # -fps only works on MacOS, via "webcollage-cocoa.m". 3924 # Ignore it if passed to this script in an X11 context. 3925 } elsif (m/^--?debug$/s) { 3926 my $which = shift @ARGV; 3927 my @rest = @search_methods; 3928 my $ok = 0; 3929 while (@rest) { 3930 my $pct = shift @rest; 3931 my $name = shift @rest; 3932 my $tfn = shift @rest; 3933 3934 if ($name eq $which) { 3935 @search_methods = (100, $name, $tfn); 3936 $ok = 1; 3937 last; 3938 } 3939 } 3940 error "no such search method as \"$which\"" unless ($ok); 3941 LOG (1, "DEBUG: using only \"$which\""); 3942 $report_performance_interval = 30; 3943 3944 } else { 3945 print STDERR "unknown option: $_\n\n"; 3946 print STDERR "$copyright\nusage: $progname " . 3947 "[--root] [--display dpy] [--verbose] [--debug which]\n" . 3948 "\t\t [--timeout secs] [--delay secs] [--size WxH]\n" . 3949 "\t\t [--no-output] [--urls-only] [--imagemap filename]\n" . 3950 "\t\t [--background color] [--opacity f]\n" . 3951 "\t\t [--filter cmd] [--filter2 cmd]\n" . 3952 "\t\t [--dictionary dictionary-file] [--http-proxy host[:port]]\n" . 3953 "\t\t [--driftnet [driftnet-program-and-args]]\n" . 3954 "\t\t [--directory local-image-directory]\n" . 3955 "\n"; 3956 exit 1; 3957 } 3958 } 3959 3960 if (!$root_p && !$no_output_p && !$cocoa_p) { 3961 print STDERR $copyright; 3962 error "the --root argument is mandatory (for now.)"; 3963 } 3964 3965 if (!$no_output_p && !$cocoa_p && !$ENV{DISPLAY}) { 3966 error "\$DISPLAY is not set."; 3967 } 3968 3969 3970 if ($verbose == 1) { 3971 $verbose_imgmap = 1; 3972 $verbose_warnings = 1; 3973 3974 } elsif ($verbose == 2) { 3975 $verbose_imgmap = 1; 3976 $verbose_warnings = 1; 3977 $verbose_load = 1; 3978 3979 } elsif ($verbose == 3) { 3980 $verbose_imgmap = 1; 3981 $verbose_warnings = 1; 3982 $verbose_load = 1; 3983 $verbose_filter = 1; 3984 3985 } elsif ($verbose == 4) { 3986 $verbose_imgmap = 1; 3987 $verbose_warnings = 1; 3988 $verbose_load = 1; 3989 $verbose_filter = 1; 3990 $verbose_net = 1; 3991 3992 } elsif ($verbose == 5) { 3993 $verbose_imgmap = 1; 3994 $verbose_warnings = 1; 3995 $verbose_load = 1; 3996 $verbose_filter = 1; 3997 $verbose_net = 1; 3998 $verbose_decode = 1; 3999 4000 } elsif ($verbose == 6) { 4001 $verbose_imgmap = 1; 4002 $verbose_warnings = 1; 4003 $verbose_load = 1; 4004 $verbose_filter = 1; 4005 $verbose_net = 1; 4006 $verbose_decode = 1; 4007 $verbose_http = 1; 4008 4009 } elsif ($verbose >= 7) { 4010 $verbose_imgmap = 1; 4011 $verbose_warnings = 1; 4012 $verbose_load = 1; 4013 $verbose_filter = 1; 4014 $verbose_net = 1; 4015 $verbose_decode = 1; 4016 $verbose_http = 1; 4017 $verbose_exec = 1; 4018 } 4019 4020 if ($dict) { 4021 error ("$dict does not exist") unless (-f $dict); 4022 $wordlist = $dict; 4023 } else { 4024 pick_dictionary(); 4025 } 4026 4027 if ($imagemap_base && !($img_width && $img_height)) { 4028 error ("--size WxH is required with --imagemap"); 4029 } 4030 4031 if (defined ($local_dir)) { 4032 $_ = "xscreensaver-getimage-file"; 4033 which ($_) || error "$_ not found on \$PATH."; 4034 } 4035 4036 init_signals(); 4037 set_proxy(); 4038 4039 spawn_driftnet ($driftnet_cmd) if ($driftnet_cmd); 4040 4041 if ($urls_only_p) { 4042 url_only_output (); 4043 } else { 4044 x_or_image_output ($window_id); 4045 } 4046} 4047 4048main(); 4049exit (0); 4050