1#!/usr/bin/perl -w
2#
3# webcollage, Copyright © 1999-2019 by Jamie Zawinski <jwz@jwz.org>
4# This program decorates the screen with random images from the web.
5# One satisfied customer described it as "a nonstop pop culture brainbath."
6#
7# Permission to use, copy, modify, distribute, and sell this software and its
8# documentation for any purpose is hereby granted without fee, provided that
9# the above copyright notice appear in all copies and that both that
10# copyright notice and this permission notice appear in supporting
11# documentation.  No representations are made about the suitability of this
12# software for any purpose.  It is provided "as is" without express or
13# implied warranty.
14
15
16# To run this as a display mode with xscreensaver, add this to `programs':
17#
18#     webcollage --root
19#     webcollage --root --filter 'vidwhacker --stdin --stdout'
20#
21#
22# You can see this in action at https://www.jwz.org/webcollage/ --
23# it auto-reloads about once a minute.  To make a page similar to
24# that on your own system, do this:
25#
26#     webcollage --size '800x600' --imagemap $HOME/www/webcollage/index
27#
28#
29# Requires that either the provided "webcollage-helper" program or
30# ImageMagick's "convert" be available on $PATH.
31#
32#
33# If you have the "driftnet" program installed, webcollage can display a
34# collage of images sniffed off your local ethernet, instead of pulled out
35# of search engines: in that way, your screensaver can display the images
36# that your co-workers are downloading!
37#
38# Driftnet is available here: http://www.ex-parrot.com/~chris/driftnet/
39# Use it like so:
40#
41#     webcollage --root --driftnet
42#
43# Driftnet is the Unix implementation of the MacOS "EtherPEG" program.
44
45
46require 5;
47use strict;
48
49# We can't "use diagnostics" here, because that library malfunctions if
50# you signal and catch alarms: it says "Uncaught exception from user code"
51# and exits, even though I damned well AM catching it!
52#use diagnostics;
53
54
55require Time::Local;
56require POSIX;
57use Fcntl ':flock'; # import LOCK_* constants
58use POSIX qw(strftime);
59use LWP::UserAgent;
60
61
62my $progname = $0; $progname =~ s@.*/@@g;
63my ($version) = ('$Revision: 1.183 $' =~ m/\s(\d[.\d]+)\s/s);
64my $copyright = "WebCollage $version, Copyright (c) 1999-2017" .
65    " Jamie Zawinski <jwz\@jwz.org>\n" .
66    "                  https://www.jwz.org/webcollage/\n";
67
68
69
70my @search_methods = (
71                      # Google is rate-limiting us now, so this works ok from
72                      # a short-running screen saver, but not as a batch job.
73                      # I haven't found a workaround.
74                      #
75                        5, "googlephotos",  \&pick_from_google_image_photos,
76                        3, "googleimgs",    \&pick_from_google_images,
77                        3, "googlenums",    \&pick_from_google_image_numbers,
78
79                      # So let's try Bing instead. No rate limiting yet!
80                      #
81                       13, "bingphotos",    \&pick_from_bing_image_photos,
82                       11, "bingimgs",      \&pick_from_bing_images,
83                       10, "bingnums",      \&pick_from_bing_image_numbers,
84
85                       20, "flickr_recent", \&pick_from_flickr_recent,
86                       15, "flickr_random", \&pick_from_flickr_random,
87                        6, "livejournal",   \&pick_from_livejournal_images,
88
89                       11, "imgur",         \&pick_from_imgur,
90
91                     # Tumblr doesn't have an "or" search, so this isn't great.
92                        3, "tumblr",        \&pick_from_tumblr,
93
94                     # I ran out of usable access tokens, May 2017
95                     #  0, "instagram",     \&pick_from_instagram,
96
97                     # No longer exists, as of Apr 2014
98                     #  0, "yahoorand",     \&pick_from_yahoo_random_link,
99
100                     # Twitter destroyed their whole API in 2013.
101                     #  0, "twitpic",       \&pick_from_twitpic_images,
102                     #  0, "twitter",       \&pick_from_twitter_images,
103
104                     # This is a cute way to search for a certain webcams.
105                     # Not included in default methods, since these images
106                     # aren't terribly interesting by themselves.
107                     # See also "SurveillanceSaver".
108                     #
109                        0, "securitycam",   \&pick_from_security_camera,
110
111                     # Nonfunctional as of June 2011.
112                     #  0, "altavista",     \&pick_from_alta_vista_random_link,
113
114                     # In Apr 2002, Google asked me to stop searching them.
115                     # I asked them to add a "random link" url.  They said
116                     # "that would be easy, we'll think about it" and then
117                     # never wrote back.  Booo Google!  Booooo!  So, screw
118                     # those turkeys, I've turned Google searching back on.
119                     # I'm sure they can take it.  (Jan 2005.)
120
121                     # Jan 2005: Yahoo fucked up their search form so that
122                     # it's no longer possible to do "or" searches on news
123                     # images, so we rarely get any hits there any more.
124                     #
125                     #  0, "yahoonews",     \&pick_from_yahoo_news_text,
126
127                     # Dec 2004: the ircimages guy's server can't take the
128                     # heat, so he started banning the webcollage user agent.
129                     # I tried to convince him to add a lighter-weight page to
130                     # support webcollage better, but he doesn't care.
131                     #
132                     #  0, "ircimages",     \&pick_from_ircimages,
133
134                     # Dec 2002: Alta Vista has a new "random link" URL now.
135                     # They added it specifically to better support webcollage!
136                     # That was super cool of them.  This is how we used to do
137                     # it, before:
138                     #
139                     #  0, "avimages",      \&pick_from_alta_vista_images,
140                     #  0, "avtext",        \&pick_from_alta_vista_text,
141
142                     # This broke in 2004.  Eh, Lycos sucks anyway.
143                     #
144                     #  0, "lycos",         \&pick_from_lycos_text,
145
146                     # This broke in 2003, I think.  I suspect Hotbot is
147                     # actually the same search engine data as Lycos.
148                     #
149                     #  0, "hotbot",        \&pick_from_hotbot_text,
150                      );
151
152# programs we can use to write to the root window (tried in ascending order.)
153#
154my @root_displayers = (
155  "xscreensaver-getimage -root -file",
156  "chbg       -once -xscreensaver -max_size 100",
157  "xv         -root -quit -viewonly +noresetroot -quick24 -rmode 5" .
158  "           -rfg black -rbg black",
159  "xli        -quiet -onroot -center -border black",
160  "xloadimage -quiet -onroot -center -border black",
161
162# this lame program wasn't built with vroot.h:
163# "xsri       -scale -keep-aspect -center-horizontal -center-vertical",
164);
165
166
167# Some sites need cookies to work properly.   These are they.
168#
169my %cookies = (
170  "www.altavista.com"  =>  "AV_ALL=1",   # request uncensored searches
171  "web.altavista.com"  =>  "AV_ALL=1",
172  "ircimages.com"      =>  'disclaimer=1',
173);
174
175
176# If this is set, it's a helper program to use for pasting images together:
177# this is somewhat faster than using ImageMagick.
178#
179my $webcollage_helper = undef;
180my $convert_cmd = 'convert';
181
182my $opacity = 0.85;  # Opacity when pasting images together.
183
184
185# Some sites have  managed to poison the search engines.  These are they.
186# (We auto-detect sites that have poisoned the search engines via excessive
187# keywords or dictionary words,  but these are ones that slip through
188# anyway.)
189#
190# This can contain full host names, or 2 or 3 component domains.
191#
192my %poisoners = (
193  "die.net"                 => 1,  # 'l33t h4ck3r d00dz.
194  "genforum.genealogy.com"  => 1,  # Cluttering avtext with human names.
195  "rootsweb.com"            => 1,  # Cluttering avtext with human names.
196  "akamai.net"              => 1,  # Lots of sites have their images on Akamai.
197  "akamaitech.net"          => 1,  # But those are pretty much all banners.
198                                   # Since Akamai is super-expensive, let's
199                                   # go out on a limb and assume that all of
200                                   # their customers are rich-and-boring.
201  "bartleby.com"            => 1,  # Dictionary, cluttering avtext.
202  "encyclopedia.com"        => 1,  # Dictionary, cluttering avtext.
203  "onlinedictionary.datasegment.com" => 1,  # Dictionary, cluttering avtext.
204  "hotlinkpics.com"         => 1,  # Porn site that has poisoned avimages
205                                   # (I don't see how they did it, though!)
206  "alwayshotels.com"        => 1,  # Poisoned Lycos pretty heavily.
207  "nextag.com"              => 1,  # Poisoned Alta Vista real good.
208  "ghettodriveby.com"       => 1,  # Poisoned Google Images.
209  "crosswordsolver.org"     => 1,  # Poisoned Google Images.
210  "xona.com"                => 1,  # Poisoned Google Images.
211  "freepatentsonline.com"   => 1,  # Poisoned Google Images.
212  "herbdatanz.com"          => 1,  # Poisoned Google Images.
213);
214
215
216# When verbosity is turned on, we warn about sites that we seem to be hitting
217# a lot: usually this means some new poisoner has made it into the search
218# engines.  But sometimes, the warning is just because that site has a lot
219# of stuff on it.  So these are the sites that are immune to the "frequent
220# site" diagnostic message.
221#
222my %warningless_sites = (
223  "home.earthlink.net"      => 1,
224  "www.angelfire.com"       => 1,
225  "members.aol.com"         => 1,
226  "img.photobucket.com"     => 1,
227  "pics.livejournal.com"    => 1,
228  "tinypic.com"             => 1,
229  "flickr.com"              => 1,
230  "staticflickr.com"        => 1,
231  "live.staticflickr.com"   => 1,
232  "pbase.com"               => 1,
233  "blogger.com"             => 1,
234  "multiply.com"            => 1,
235  "wikimedia.org"           => 1,
236  "twitpic.com"             => 1,
237  "amazonaws.com"           => 1,
238  "blogspot.com"            => 1,
239  "photoshelter.com"        => 1,
240  "myspacecdn.com"          => 1,
241  "feedburner.com"          => 1,
242  "wikia.com"               => 1,
243  "ljplus.ru"               => 1,
244  "yandex.ru"               => 1,
245  "imgur.com"               => 1,
246  "tumblr.com"              => 1,
247  "yfrog.com"               => 1,
248  "cdninstagram.com"        => 1,
249  "gstatic.com"		    => 1,
250
251  "yimg.com"                => 1,  # This is where dailynews.yahoo.com stores
252  "eimg.com"                => 1,  # its images, so pick_from_yahoo_news_text()
253                                   # hits this every time.
254
255  "images.quizfarm.com"     => 1,  # damn those LJ quizzes...
256  "images.quizilla.com"     => 1,
257  "images.quizdiva.net"     => 1,
258
259  "driftnet"                => 1,  # builtin...
260  "local-directory"         => 1,  # builtin...
261);
262
263
264# For decoding HTML-encoded character entities to URLs.
265# In This Modern World, probably we should use HTML::Entities instead.
266#
267my %entity_table = (
268   "apos"   => '\'',
269   "quot"   => '"',    "amp"    => '&',    "lt"     => '<',
270   "gt"     => '>',    "nbsp"   => ' ',    "iexcl"  => '',
271   "cent"   => "\xA2", "pound"  => "\xA3", "curren" => "\xA4",
272   "yen"    => "\xA5", "brvbar" => "\xA6", "sect"   => "\xA7",
273   "uml"    => "\xA8", "copy"   => "\xA9", "ordf"   => "\xAA",
274   "laquo"  => "\xAB", "not"    => "\xAC", "shy"    => "\xAD",
275   "reg"    => "\xAE", "macr"   => "\xAF", "deg"    => "\xB0",
276   "plusmn" => "\xB1", "sup2"   => "\xB2", "sup3"   => "\xB3",
277   "acute"  => "\xB4", "micro"  => "\xB5", "para"   => "\xB6",
278   "middot" => "\xB7", "cedil"  => "\xB8", "sup1"   => "\xB9",
279   "ordm"   => "\xBA", "raquo"  => "\xBB", "frac14" => "\xBC",
280   "frac12" => "\xBD", "frac34" => "\xBE", "iquest" => "\xBF",
281   "Agrave" => "\xC0", "Aacute" => "\xC1", "Acirc"  => "\xC2",
282   "Atilde" => "\xC3", "Auml"   => "\xC4", "Aring"  => "\xC5",
283   "AElig"  => "\xC6", "Ccedil" => "\xC7", "Egrave" => "\xC8",
284   "Eacute" => "\xC9", "Ecirc"  => "\xCA", "Euml"   => "\xCB",
285   "Igrave" => "\xCC", "Iacute" => "\xCD", "Icirc"  => "\xCE",
286   "Iuml"   => "\xCF", "ETH"    => "\xD0", "Ntilde" => "\xD1",
287   "Ograve" => "\xD2", "Oacute" => "\xD3", "Ocirc"  => "\xD4",
288   "Otilde" => "\xD5", "Ouml"   => "\xD6", "times"  => "\xD7",
289   "Oslash" => "\xD8", "Ugrave" => "\xD9", "Uacute" => "\xDA",
290   "Ucirc"  => "\xDB", "Uuml"   => "\xDC", "Yacute" => "\xDD",
291   "THORN"  => "\xDE", "szlig"  => "\xDF", "agrave" => "\xE0",
292   "aacute" => "\xE1", "acirc"  => "\xE2", "atilde" => "\xE3",
293   "auml"   => "\xE4", "aring"  => "\xE5", "aelig"  => "\xE6",
294   "ccedil" => "\xE7", "egrave" => "\xE8", "eacute" => "\xE9",
295   "ecirc"  => "\xEA", "euml"   => "\xEB", "igrave" => "\xEC",
296   "iacute" => "\xED", "icirc"  => "\xEE", "iuml"   => "\xEF",
297   "eth"    => "\xF0", "ntilde" => "\xF1", "ograve" => "\xF2",
298   "oacute" => "\xF3", "ocirc"  => "\xF4", "otilde" => "\xF5",
299   "ouml"   => "\xF6", "divide" => "\xF7", "oslash" => "\xF8",
300   "ugrave" => "\xF9", "uacute" => "\xFA", "ucirc"  => "\xFB",
301   "uuml"   => "\xFC", "yacute" => "\xFD", "thorn"  => "\xFE",
302   "yuml"   => "\xFF",
303
304   # HTML 4 entities that do not have 1:1 Latin1 mappings.
305   "bull"  => "*",    "hellip"=> "...",  "prime" => "'",  "Prime" => "\"",
306   "frasl" => "/",    "trade" => "[tm]", "larr"  => "<-", "rarr"  => "->",
307   "harr"  => "<->",  "lArr"  => "<=",   "rArr"  => "=>", "hArr"  => "<=>",
308   "empty" => "\xD8", "minus" => "-",    "lowast"=> "*",  "sim"   => "~",
309   "cong"  => "=~",   "asymp" => "~",    "ne"    => "!=", "equiv" => "==",
310   "le"    => "<=",   "ge"    => ">=",   "lang"  => "<",  "rang"  => ">",
311   "loz"   => "<>",   "OElig" => "OE",   "oelig" => "oe", "Yuml"  => "Y",
312   "circ"  => "^",    "tilde" => "~",    "ensp"  => " ",  "emsp"  => " ",
313   "thinsp"=> " ",    "ndash" => "-",    "mdash" => "--", "lsquo" => "`",
314   "rsquo" => "'",    "sbquo" => "'",    "ldquo" => "\"", "rdquo" => "\"",
315   "bdquo" => "\"",   "lsaquo"=> "<",    "rsaquo"=> ">",
316);
317
318
319##############################################################################
320#
321# Various global flags set by command line parameters, or computed
322#
323##############################################################################
324
325
326my $current_state = "???";      # for diagnostics
327my $load_method;
328my $last_search;
329my $image_succeeded = -1;
330my $suppress_audit = 0;
331
332my $verbose_imgmap = 0;         # print out rectangles and URLs only (stdout)
333my $verbose_warnings = 0;       # print out warnings when things go wrong
334my $verbose_load = 0;           # diagnostics about loading of URLs
335my $verbose_filter = 0;         # diagnostics about page selection/rejection
336my $verbose_net = 0;            # diagnostics about network I/O
337my $verbose_decode = 0;         # diagnostics about img conversion pipelines
338my $verbose_http = 0;           # diagnostics about all HTTP activity
339my $verbose_exec = 0;           # diagnostics about executing programs
340
341my $report_performance_interval = 60 * 15;  # print some stats every 15 minutes
342
343my $http_proxy = undef;
344my $http_timeout = 20;
345my $cvt_timeout = 10;
346
347my $min_width = 50;
348my $min_height = 50;
349my $min_ratio = 1/5;
350
351my $min_gif_area = (120 * 120);
352
353
354my $no_output_p = 0;
355my $urls_only_p = 0;
356my $cocoa_p = 0;
357my $imagemap_base = undef;
358
359my @pids_to_kill = ();  # forked pids we should kill when we exit, if any.
360
361my $driftnet_magic = 'driftnet';
362my $driftnet_dir = undef;
363my $default_driftnet_cmd = "driftnet -a -m 100";
364
365my $local_magic = 'local-directory';
366my $local_dir = undef;
367
368my $wordlist;
369
370my %rejected_urls;
371my @tripwire_words = ("aberrate", "abode", "amorphous", "antioch",
372                      "arrhenius", "arteriole", "blanket", "brainchild",
373                      "burdensome", "carnival", "cherub", "chord", "clever",
374                      "dedicate", "dilogarithm", "dolan", "dryden",
375                      "eggplant");
376
377
378##############################################################################
379#
380# Retrieving URLs
381#
382##############################################################################
383
384# returns three values: the HTTP response line; the document headers;
385# and the document body.
386#
387sub get_document_1($$$) {
388  my ($url, $referer, $timeout) = @_;
389
390  if (!defined($timeout)) { $timeout = $http_timeout; }
391  if ($timeout > $http_timeout) { $timeout = $http_timeout; }
392
393  my $user_agent = "$progname/$version";
394
395  if ($url =~ m@^https?://www\.altavista\.com/@s ||
396      $url =~ m@^https?://random\.yahoo\.com/@s  ||
397      $url =~ m@^https?://[^./]+\.google\.com/@s ||
398      $url =~ m@^https?://www\.livejournal\.com/@s) {
399    # block this, you turkeys.
400    $user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.7)' .
401                  ' Gecko/20070914 Firefox/2.0.0.7';
402  }
403
404  my $ua = LWP::UserAgent->new ( agent => $user_agent,
405                                 keep_alive => 0,
406                                 env_proxy => 0,
407                               );
408  $ua->proxy ('http', $http_proxy) if $http_proxy;
409  $ua->default_header ('Referer' => $referer) if $referer;
410  $ua->default_header ('Accept' => '*/*');
411  $ua->timeout($timeout) if $timeout;
412
413  if (0) {
414    $ua->add_handler ("request_send",
415                      sub($$$) {
416                        my ($req, $ua, $h) = @_;
417                        print "\n>>[[\n"; $req->dump; print "\n]]\n";
418                        return;
419                      });
420    $ua->add_handler ("response_data",
421                      sub($$$$) {
422                        my ($req, $ua, $h, $data) = @_;
423                        #print "\n<<[[\n"; print $data; print "\n]]\n";
424                        return 1;
425                      });
426    $ua->add_handler ("request_done",
427                      sub($$$) {
428                        my ($req, $ua, $h) = @_;
429                        print "\n<<[[\n"; $req->dump; print "\n]]\n";
430                        return;
431                      });
432  }
433
434  if ($verbose_http) {
435    LOG (1, "  ==> GET $url");
436    LOG (1, "  ==> User-Agent: $user_agent");
437    LOG (1, "  ==> Referer: $referer") if $referer;
438  }
439
440  my $res = $ua->get ($url);
441
442  my $http = ($res ? $res->status_line : '') || '';
443  my $head = ($res ? $res->headers()   : '') || '';
444  $head = $head->as_string() if $head;
445  my $body = ($res && $res->is_success ? $res->decoded_content : '') || '';
446
447  LOG ($verbose_net, "get_document_1 $url " . ($referer ? $referer : ""));
448
449  $head =~ s/\r\n/\n/gs;
450  $head =~ s/\r/\n/gs;
451  if ($verbose_http) {
452    foreach (split (/\n/, $head)) {
453      LOG ($verbose_http, "  <== $_");
454    }
455  }
456
457  my @L = split(/\r\n|\r|\n/, $body);
458  my $lines = @L;
459  LOG ($verbose_http,
460       "  <== [ body ]: $lines lines, " . length($body) . " bytes");
461
462  if (!$http) {
463    LOG (($verbose_net || $verbose_load), "null response: $url");
464    return ();
465  }
466
467  return ( $http, $head, $body );
468}
469
470
471# returns two values: the document headers; and the document body.
472# if the given URL did a redirect, returns the redirected-to document.
473#
474sub get_document($$;$) {
475  my ($url, $referer, $timeout) = @_;
476  my $start = time;
477
478  if (defined($referer) && $referer eq $driftnet_magic) {
479    return get_driftnet_file ($url);
480  }
481
482  if (defined($referer) && $referer eq $local_magic) {
483    return get_local_file ($url);
484  }
485
486  my $orig_url = $url;
487  my $loop_count = 0;
488  my $max_loop_count = 4;
489
490  do {
491    if (defined($timeout) && $timeout <= 0) {
492      LOG (($verbose_net || $verbose_load), "timed out for $url");
493      $suppress_audit = 1;
494      return ();
495    }
496
497    my ( $http, $head, $body ) = get_document_1 ($url, $referer, $timeout);
498
499    if (defined ($timeout)) {
500      my $now = time;
501      my $elapsed = $now - $start;
502      $timeout -= $elapsed;
503      $start = $now;
504    }
505
506    return () unless $http; # error message already printed
507
508    $http =~ s/[\r\n]+$//s;
509
510    if ( $http =~ m@^HTTP/[0-9.]+ 30[123]@ ) {
511      $_ = $head;
512
513      my ( $location ) = m@^location:[ \t]*(.*)$@im;
514      if ( $location ) {
515        $location =~ s/[\r\n]$//;
516
517        LOG ($verbose_net, "redirect from $url to $location");
518        $referer = $url;
519        $url = $location;
520
521        if ($url =~ m@^/@) {
522          $referer =~ m@^(https?://[^/]+)@i;
523          $url = $1 . $url;
524        } elsif (! ($url =~ m@^[a-z]+:@i)) {
525          $_ = $referer;
526          s@[^/]+$@@g if m@^https?://[^/]+/@i;
527          $_ .= "/" if m@^https?://[^/]+$@i;
528          $url = $_ . $url;
529        }
530
531      } else {
532        LOG ($verbose_net, "no Location with \"$http\"");
533        return ( $url, $body );
534      }
535
536      if ($loop_count++ > $max_loop_count) {
537        LOG ($verbose_net,
538             "too many redirects ($max_loop_count) from $orig_url");
539        $body = undef;
540        return ();
541      }
542
543    } elsif ( $http =~ m@^HTTP/[0-9.]+ ([4-9][0-9][0-9].*)$@ ) {
544
545      LOG (($verbose_net || $verbose_load), "failed: $1 ($url)");
546
547      # http errors -- return nothing.
548      $body = undef;
549      return ();
550
551    } elsif (!$body) {
552
553      LOG (($verbose_net || $verbose_load), "document contains no data: $url");
554      return ();
555
556    } else {
557
558      # ok!
559      return ( $url, $body );
560    }
561
562  } while (1);
563}
564
565# If we already have a cookie defined for this site, and the site is trying
566# to overwrite that very same cookie, let it do so.  This is because nytimes
567# expires its cookies - it lets you upgrade to a new cookie without logging
568# in again, but you have to present the old cookie to get the new cookie.
569# So, by doing this, the built-in cypherpunks cookie will never go "stale".
570#
571sub set_cookie($$) {
572  my ($host, $cookie) = @_;
573  my $oc = $cookies{$host};
574  return unless $oc;
575  $_ = $oc;
576  my ($oc_name, $oc_value) = m@^([^= \t\r\n]+)=(.*)$@;
577  $_ = $cookie;
578  my ($nc_name, $nc_value) = m@^([^= \t\r\n]+)=(.*)$@;
579
580  if ($oc_name eq $nc_name &&
581      $oc_value ne $nc_value) {
582    $cookies{$host} = $cookie;
583    LOG ($verbose_net, "overwrote ${host}'s $oc_name cookie");
584  }
585}
586
587
588############################################################################
589#
590# Extracting image URLs from HTML
591#
592############################################################################
593
594# given a URL and the body text at that URL, selects and returns a random
595# image from it.  returns () if no suitable images found.
596#
597sub pick_image_from_body($$) {
598  my ($url, $body) = @_;
599
600  my $base = $url;
601  $_ = $url;
602
603  # if there's at least one slash after the host, take off the last
604  # pathname component
605  if ( m@^https?://[^/]+/@io ) {
606    $base =~ s@[^/]+$@@go;
607  }
608
609  # if there are no slashes after the host at all, put one on the end.
610  if ( m@^https?://[^/]+$@io ) {
611    $base .= "/";
612  }
613
614  $_ = $body;
615
616  # strip out newlines, compress whitespace
617  s/[\r\n\t ]+/ /go;
618
619  # nuke comments
620  s/<!--.*?-->//go;
621
622
623  # There are certain web sites that list huge numbers of dictionary
624  # words in their bodies or in their <META NAME=KEYWORDS> tags (surprise!
625  # Porn sites tend not to be reputable!)
626  #
627  # I do not want webcollage to filter on content: I want it to select
628  # randomly from the set of images on the web.  All the logic here for
629  # rejecting some images is really a set of heuristics for rejecting
630  # images that are not really images: for rejecting *text* that is in
631  # GIF/JPEG/PNG form.  I don't want text, I want pictures, and I want
632  # the content of the pictures to be randomly selected from among all
633  # the available content.
634  #
635  # So, filtering out "dirty" pictures by looking for "dirty" keywords
636  # would be wrong: dirty pictures exist, like it or not, so webcollage
637  # should be able to select them.
638  #
639  # However, picking a random URL is a hard thing to do.  The mechanism I'm
640  # using is to search for a selection of random words.  This is not
641  # perfect, but works ok most of the time.  The way it breaks down is when
642  # some URLs get precedence because their pages list *every word* as
643  # related -- those URLs come up more often than others.
644  #
645  # So, after we've retrieved a URL, if it has too many keywords, reject
646  # it.  We reject it not on the basis of what those keywords are, but on
647  # the basis that by having so many, the page has gotten an unfair
648  # advantage against our randomizer.
649  #
650  my $trip_count = 0;
651  foreach my $trip (@tripwire_words) {
652    $trip_count++ if m/$trip/i;
653  }
654
655  if ($trip_count >= $#tripwire_words - 2) {
656    LOG (($verbose_filter || $verbose_load),
657         "there is probably a dictionary in \"$url\": rejecting.");
658    $rejected_urls{$url} = -1;
659    $body = undef;
660    $_ = undef;
661    return ();
662  }
663
664
665  my @urls;
666  my %unique_urls;
667
668  foreach (split(/ *</)) {
669    if ( m/^meta.*["']keywords["']/i ) {
670
671      # Likewise, reject any web pages that have a KEYWORDS meta tag
672      # that is too long.
673      #
674      my $L = length($_);
675      if ($L > 1000) {
676        LOG (($verbose_filter || $verbose_load),
677             "excessive keywords ($L bytes) in $url: rejecting.");
678        $rejected_urls{$url} = $L;
679        $body = undef;
680        $_ = undef;
681        return ();
682      } else {
683        LOG ($verbose_filter, "  keywords ($L bytes) in $url (ok)");
684      }
685
686    } elsif (m/^ (IMG|A) \b .* (SRC|HREF) \s* = \s* ["']? (.*?) [ "'<>] /six ||
687             m/^ (LINK|META) \b .* (REL|PROPERTY) \s* = \s*
688                 ["']? (image_src|og:image) ["']? /six) {
689
690      my $was_inline = (lc($1) eq 'img');
691      my $was_meta   = (lc($1) eq 'link' || lc($1) eq 'meta');
692      my $link = $3;
693
694      # For <link rel="image_src" href="...">
695      # and <meta property="og:image" content="...">
696      #
697      if ($was_meta) {
698        next unless (m/ (HREF|CONTENT) \s* = \s* ["']? (.*?) [ "'<>] /six);
699        $link = $2;
700      }
701
702      my ( $width )  = m/width ?=[ \"]*(\d+)/oi;
703      my ( $height ) = m/height ?=[ \"]*(\d+)/oi;
704      $_ = $link;
705
706      if ( m@^/@o ) {
707        my $site;
708        ( $site = $base ) =~ s@^(https?://[^/]*).*@$1@gio;
709        $_ = "$site$link";
710      } elsif ( ! m@^[^/:?]+:@ ) {
711        $_ = "$base$link";
712        s@/\./@/@g;
713        1 while (s@/[^/]+/\.\./@/@g);
714      }
715
716      # skip non-http
717      if ( ! m@^https?://@io ) {
718        next;
719      }
720
721      # skip non-image
722      if ( ! m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@io ) {
723        next;
724      }
725
726      # skip really short or really narrow images
727      if ( $width && $width < $min_width) {
728        if (!$height) { $height = "?"; }
729        LOG ($verbose_filter, "  skip narrow image $_ (${width}x$height)");
730        next;
731      }
732
733      if ( $height && $height < $min_height) {
734        if (!$width) { $width = "?"; }
735        LOG ($verbose_filter, "  skip short image $_ (${width}x$height)");
736        next;
737      }
738
739      # skip images with ratios that make them look like banners.
740      if ($min_ratio && $width && $height &&
741          ($width * $min_ratio ) > $height) {
742        if (!$height) { $height = "?"; }
743        LOG ($verbose_filter, "  skip bad ratio $_ (${width}x$height)");
744        next;
745      }
746
747      # skip GIFs with a small number of pixels -- those usually suck.
748      if ($width && $height &&
749          m/\.gif$/io &&
750          ($width * $height) < $min_gif_area) {
751        LOG ($verbose_filter, "  skip small GIF $_ (${width}x$height)");
752        next;
753      }
754
755      # skip images with a URL that indicates a Yahoo thumbnail.
756      if (m@\.yimg\.com/.*/t/@) {
757        if (!$width)  { $width  = "?"; }
758        if (!$height) { $height = "?"; }
759        LOG ($verbose_filter, "  skip yahoo thumb $_ (${width}x$height)");
760        next;
761      }
762
763      my $url = $_;
764
765      if ($unique_urls{$url}) {
766        LOG ($verbose_filter, "  skip duplicate image $_");
767        next;
768      }
769
770      LOG ($verbose_filter,
771           "  image $url" .
772           ($width && $height ? " (${width}x${height})" : "") .
773           ($was_meta ? " (meta)" : $was_inline ? " (inline)" : ""));
774
775
776      my $weight = 1;
777
778      if ($was_meta) {
779        $weight = 20;	 # meta tag images are far preferable to inline images.
780      } else {
781        if ($url !~ m@[.](gif|png)$@io ) {
782          $weight += 2;	 # JPEGs are preferable to GIFs and PNGs.
783        }
784        if (! $was_inline) {
785          $weight += 4;	 # pointers to images are preferable to inlined images.
786        }
787      }
788
789      $unique_urls{$url}++;
790      for (my $i = 0; $i < $weight; $i++) {
791        $urls[++$#urls] = $url;
792      }
793    }
794  }
795
796  my $fsp = ($body =~ m@<frameset@i);
797
798  $_ = undef;
799  $body = undef;
800
801  @urls = depoison (@urls);
802
803  if ( $#urls < 0 ) {
804    LOG ($verbose_load, "no images on $base" . ($fsp ? " (frameset)" : ""));
805    return ();
806  }
807
808  # pick a random element of the table
809  my $i = int(rand($#urls+1));
810  $url = $urls[$i];
811
812  LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
813
814  return $url;
815}
816
817
818# Given a URL and the RSS feed from that URL, pick a random image from
819# the feed.  This is a lot simpler than extracting images out of a page:
820# we already know we have reasonable images, so we just pick one.
821# Returns: the real URL of the page (preferably not the RSS version),
822# and the image.
823
824sub pick_image_from_rss($$) {
825  my ($url, $body) = @_;
826
827  my ($base) = ($body =~ m@<link>([^<>]+)</link>@si);	# root link
828
829  my @items = ($body =~ m@<item\b[^<>]*>(.*?)</item>@gsi);
830  return unless @items;
831
832  my $n = @items;
833  my $i = int(rand($n));
834  my $item = $items[$i];
835
836  $base = $1 if ($item =~ m@<link>([^<>]+)</link>@si);	# item link
837  $base = $url unless $base;
838
839  ($url) = ($item =~ m/<enclosure\b[^<>]*\burl="(.*?)"/si);
840  return unless $url;
841
842  LOG ($verbose_load, "picked image $i/$n: $url");
843  return ($base, $url);
844}
845
846
847############################################################################
848#
849# Subroutines for getting pages and images out of search engines
850#
851############################################################################
852
853
854sub pick_dictionary() {
855  my @dicts = ("/usr/dict/words",
856               "/usr/share/dict/words",
857               "/usr/share/lib/dict/words",
858               "/usr/share/dict/cracklib-small",
859               "/usr/share/dict/cracklib-words"
860               );
861  foreach my $f (@dicts) {
862    if (-f $f) {
863      $wordlist = $f;
864      last;
865    }
866  }
867  error ("$dicts[0] does not exist") unless defined($wordlist);
868}
869
870# returns a random word from the dictionary
871#
872sub random_word() {
873
874  return undef unless open (my $in, '<', $wordlist);
875
876  my $size = (stat($in))[7];
877  my $word = undef;
878  my $count = 0;
879
880  while (1) {
881    error ("looping ($count) while reading $wordlist")
882      if (++$count > 100);
883
884    my $pos = int (rand ($size));
885    if (seek ($in, $pos, 0)) {
886      $word = <$in>;   # toss partial line
887      $word = <$in>;   # keep next line
888    }
889
890    next unless ($word);
891    next if ($word =~ m/^[-\']/);
892
893    $word = lc($word);
894    $word =~ s/^.*-//s;
895    $word =~ s/^[^a-z]+//s;
896    $word =~ s/[^a-z]+$//s;
897    $word =~ s/\'s$//s;
898    $word =~ s/ys$/y/s;
899    $word =~ s/ally$//s;
900    $word =~ s/ly$//s;
901    $word =~ s/ies$/y/s;
902    $word =~ s/ally$/al/s;
903    $word =~ s/izes$/ize/s;
904    $word =~ s/esses$/ess/s;
905    $word =~ s/(.{5})ing$/$1/s;
906
907    next if (length ($word) > 14);
908    last if ($word);
909  }
910
911  close ($in);
912
913  if ( $word =~ s/\s/\+/gs ) {  # convert intra-word spaces to "+".
914    $word = "\%22$word\%22";    # And put quotes (%22) around it.
915  }
916
917  return $word;
918}
919
920
921sub random_words($) {
922  my ($sep) = @_;
923  return (random_word() . $sep .
924          random_word() . $sep .
925          random_word() . $sep .
926          random_word() . $sep .
927          random_word());
928}
929
930
931sub url_quote($) {
932  my ($s) = @_;
933  $s =~ s|([^-a-zA-Z0-9.\@/_\r\n])|sprintf("%%%02X", ord($1))|ge;
934  return $s;
935}
936
937sub url_unquote($) {
938  my ($s) = @_;
939  $s =~ s/[+]/ /g;
940  $s =~ s/%([a-z0-9]{2})/chr(hex($1))/ige;
941  return $s;
942}
943
944sub html_quote($) {
945  my ($s) = @_;
946  $s =~ s/&/&amp;/gi;
947  $s =~ s/</&lt;/gi;
948  $s =~ s/>/&gt;/gi;
949  $s =~ s/\"/&quot;/gi;
950  return $s;
951}
952
953sub html_unquote($) {
954  my ($s) = @_;
955  $s =~ s/(&([a-z]+);)/{ $entity_table{$2} || $1; }/gexi;  # e.g., &apos;
956  $s =~ s/(&\#(\d+);)/{ chr($2) }/gexi;                    # e.g., &#39;
957  return $s;
958}
959
960
961# Loads the given URL (a search on some search engine) and returns:
962# - the total number of hits the search engine claimed it had;
963# - a list of URLs from the page that the search engine returned;
964# Note that this list contains all kinds of internal search engine
965# junk URLs too -- caller must prune them.
966#
967sub pick_from_search_engine($$$) {
968  my ( $timeout, $search_url, $words ) = @_;
969
970  $_ = $words;
971  s/%20/ /g;
972
973  print STDERR "\n\n" if ($verbose_load);
974
975  LOG ($verbose_load, "words: $_");
976  LOG ($verbose_load, "URL: $search_url");
977
978  $last_search = $search_url;   # for warnings
979
980  my $start = time;
981  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
982  if (defined ($timeout)) {
983    $timeout -= (time - $start);
984    if ($timeout <= 0) {
985      $body = undef;
986      LOG (($verbose_net || $verbose_load),
987           "timed out (late) for $search_url");
988      $suppress_audit = 1;
989      return ();
990    }
991  }
992
993  return () if (! $body);
994
995
996  my @subpages;
997
998  my $search_count = "?";
999  if ($body =~ m@found (approximately |about )?(<B>)?(\d+)(</B>)? image@) {
1000    $search_count = $3;
1001  } elsif ($body =~ m@<NOBR>((\d{1,3})(,\d{3})*)&nbsp;@i) {
1002    $search_count = $1;
1003  } elsif ($body =~ m@found ((\d{1,3})(,\d{3})*|\d+) Web p@) {
1004    $search_count = $1;
1005  } elsif ($body =~ m@found about ((\d{1,3})(,\d{3})*|\d+) results@) {
1006    $search_count = $1;
1007  } elsif ($body =~ m@\b\d+ - \d+ of (\d+)\b@i) { # avimages
1008    $search_count = $1;
1009  } elsif ($body =~ m@About ((\d{1,3})(,\d{3})*) images@i) { # avimages
1010    $search_count = $1;
1011  } elsif ($body =~ m@We found ((\d{1,3})(,\d{3})*|\d+) results@i) { # *vista
1012    $search_count = $1;
1013  } elsif ($body =~ m@ of about <B>((\d{1,3})(,\d{3})*)<@i) { # googleimages
1014    $search_count = $1;
1015  } elsif ($body =~ m@<B>((\d{1,3})(,\d{3})*)</B> Web sites were found@i) {
1016    $search_count = $1;    # lycos
1017  } elsif ($body =~ m@WEB.*?RESULTS.*?\b((\d{1,3})(,\d{3})*)\b.*?Matches@i) {
1018    $search_count = $1;                          # hotbot
1019  } elsif ($body =~ m@no photos were found containing@i) { # avimages
1020    $search_count = "0";
1021  } elsif ($body =~ m@found no document matching@i) { # avtext
1022    $search_count = "0";
1023  }
1024  1 while ($search_count =~ s/^(\d+)(\d{3})/$1,$2/);
1025
1026#  if ($search_count eq "?" || $search_count eq "0") {
1027#    my $file = "/tmp/wc.html";
1028#    open (my $out, '>', $file) || error ("writing $file: $!");
1029#    print $out $body;
1030#    close $out;
1031#    print STDERR  blurb() . "###### wrote $file\n";
1032#  }
1033
1034
1035  my $length = length($body);
1036  my $href_count = 0;
1037
1038  $_ = $body;
1039
1040  s/[\r\n\t ]+/ /g;
1041
1042
1043  s/(<A )/\n$1/gi;
1044  foreach (split(/\n/)) {
1045    $href_count++;
1046    my ($u) = m@<A\s.*?\bHREF\s*=\s*([\"\'][^\"\'<>]+)@i;
1047    next unless $u;
1048    my ($u2) = m@<IMG\s.*\bSRC\s*=\s*[\"\']([^\"\'<>]+)@i;
1049
1050    if (m/\bm="\{(.*?)\}"/s) {		# Bing info is inside JSON crud
1051      my $json = html_unquote($1);
1052      my ($href) = ($json =~ m/\b(?:surl|purl)\"?:\s*"(.*?)"/s);
1053      my ($img)  = ($json =~ m/\b(?:imgurl|murl)\"?:\s*"(.*?)"/s);
1054      $u = "$img\t$href" if ($img && $href);
1055
1056    } elsif ($u2 && $u2 =~ m@://[^/]*\.gstatic\.com/@s) { $u = $u2;
1057                                                          $u =~ s/^\"|\"$//s;
1058
1059    } elsif ($u =~ m/^\"([^\"]*)\"/) { $u = $1   # quoted string
1060    } elsif ($u =~ m/^([^\s]*)\s/) { $u = $1;    # or token
1061    }
1062
1063    if ( $rejected_urls{$u} ) {
1064      LOG ($verbose_filter, "  pre-rejecting candidate: $u");
1065      next;
1066    }
1067
1068    LOG ($verbose_http, "    HREF: $u");
1069
1070    $subpages[++$#subpages] = $u;
1071  }
1072
1073  if ( $#subpages < 0 ) {
1074    LOG ($verbose_filter,
1075         "found nothing on $base ($length bytes, $href_count links).");
1076    return ();
1077  }
1078
1079  LOG ($verbose_filter, "" . $#subpages+1 . " links on $search_url");
1080
1081  return ($search_count, @subpages);
1082}
1083
1084
1085sub depoison(@) {
1086  my (@urls) = @_;
1087  my @urls2 = ();
1088  foreach (@urls) {
1089    my ($h) = m@^https?://([^/: \t\r\n]+)@i;
1090
1091    next unless defined($h);
1092
1093    if ($poisoners{$h}) {
1094      LOG (($verbose_filter), "  rejecting poisoner: $_");
1095      next;
1096    }
1097    if ($h =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
1098        $poisoners{$1}) {
1099      LOG (($verbose_filter), "  rejecting poisoner: $_");
1100      next;
1101    }
1102    if ($h =~ m@([^.]+\.[^.]+)$@ &&
1103        $poisoners{$1}) {
1104      LOG (($verbose_filter), "  rejecting poisoner: $_");
1105      next;
1106    }
1107
1108    push @urls2, $_;
1109  }
1110  return @urls2;
1111}
1112
1113
1114# given a list of URLs, picks one at random; loads it; and returns a
1115# random image from it.
1116# returns the url of the page loaded; the url of the image chosen.
1117#
1118sub pick_image_from_pages($$$$@) {
1119  my ($base, $total_hit_count, $unfiltered_link_count, $timeout, @pages) = @_;
1120
1121  $total_hit_count = "?" unless defined($total_hit_count);
1122
1123  @pages = depoison (@pages);
1124  LOG ($verbose_load,
1125       "" . ($#pages+1) . " candidates of $unfiltered_link_count links" .
1126       " ($total_hit_count total)");
1127
1128  return () if ($#pages < 0);
1129
1130  my $i = int(rand($#pages+1));
1131  my $page = $pages[$i];
1132
1133  LOG ($verbose_load, "picked page $page");
1134
1135  $suppress_audit = 1;
1136
1137  my ( $base2, $body2 ) = get_document ($page, $base, $timeout);
1138
1139  if (!$base2 || !$body2) {
1140    $body2 = undef;
1141    return ();
1142  }
1143
1144  my $img = pick_image_from_body ($base2, $body2);
1145  $body2 = undef;
1146
1147  if ($img) {
1148    return ($base2, $img);
1149  } else {
1150    return ();
1151  }
1152}
1153
1154
1155#############################################################################
1156##
1157## Pick images from random pages returned by the Yahoo Random Link
1158##
1159#############################################################################
1160#
1161## yahoorand
1162#my $yahoo_random_link = "http://random.yahoo.com/fast/ryl";
1163#
1164#
1165# Picks a random page; picks a random image on that page;
1166# returns two URLs: the page containing the image, and the image.
1167# Returns () if nothing found this time.
1168#
1169#sub pick_from_yahoo_random_link($) {
1170#  my ($timeout) = @_;
1171#
1172#  print STDERR "\n\n" if ($verbose_load);
1173#  LOG ($verbose_load, "URL: $yahoo_random_link");
1174#
1175#  $last_search = $yahoo_random_link;   # for warnings
1176#
1177#  $suppress_audit = 1;
1178#
1179#  my ( $base, $body ) = get_document ($yahoo_random_link, undef, $timeout);
1180#  if (!$base || !$body) {
1181#    $body = undef;
1182#    return;
1183#  }
1184#
1185#  LOG ($verbose_load, "redirected to: $base");
1186#
1187#  my $img = pick_image_from_body ($base, $body);
1188#  $body = undef;
1189#
1190#  if ($img) {
1191#    return ($base, $img);
1192#  } else {
1193#    return ();
1194#  }
1195#}
1196
1197
1198############################################################################
1199#
1200# Pick images from random pages returned by the Alta Vista Random Link
1201# Note: this seems to have gotten a *lot* less random lately (2007).
1202#
1203############################################################################
1204
1205# altavista
1206my $alta_vista_random_link = "http://www.altavista.com/image/randomlink";
1207
1208
1209# Picks a random page; picks a random image on that page;
1210# returns two URLs: the page containing the image, and the image.
1211# Returns () if nothing found this time.
1212#
1213sub pick_from_alta_vista_random_link($) {
1214  my ($timeout) = @_;
1215
1216  print STDERR "\n\n" if ($verbose_load);
1217  LOG ($verbose_load, "URL: $alta_vista_random_link");
1218
1219  $last_search = $alta_vista_random_link;   # for warnings
1220
1221  $suppress_audit = 1;
1222
1223  my ( $base, $body ) = get_document ($alta_vista_random_link,
1224                                      undef, $timeout);
1225  if (!$base || !$body) {
1226    $body = undef;
1227    return;
1228  }
1229
1230  LOG ($verbose_load, "redirected to: $base");
1231
1232  my $img = pick_image_from_body ($base, $body);
1233  $body = undef;
1234
1235  if ($img) {
1236    return ($base, $img);
1237  } else {
1238    return ();
1239  }
1240}
1241
1242
1243############################################################################
1244#
1245# Pick images by feeding random words into Alta Vista Image Search
1246#
1247############################################################################
1248
1249
1250my $alta_vista_images_url = "http://www.altavista.com/image/results" .
1251                            "?ipht=1" .       # photos
1252                            "&igrph=1" .      # graphics
1253                            "&iclr=1" .       # color
1254                            "&ibw=1" .        # b&w
1255                            "&micat=1" .      # no partner sites
1256                            "&sc=on" .        # "site collapse"
1257                            "&q=";
1258
1259# avimages
1260sub pick_from_alta_vista_images($) {
1261  my ($timeout) = @_;
1262
1263  my $words = random_word();
1264  my $page = (int(rand(9)) + 1);
1265  my $search_url = $alta_vista_images_url . $words;
1266
1267  if ($page > 1) {
1268    $search_url .= "&pgno=" . $page;		# page number
1269    $search_url .= "&stq=" . (($page-1) * 12);  # first hit result on page
1270  }
1271
1272  my ($search_hit_count, @subpages) =
1273    pick_from_search_engine ($timeout, $search_url, $words);
1274
1275  my @candidates = ();
1276  foreach my $u (@subpages) {
1277
1278    # avimages is encoding their URLs now.
1279    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
1280    $u = url_unquote($u);
1281
1282    next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1283    next if ($u =~ m@[/.]altavista\.com\b@i);     # skip altavista builtins
1284    next if ($u =~ m@[/.]yahoo\.com\b@i);         # yahoo and av in cahoots?
1285    next if ($u =~ m@[/.]doubleclick\.net\b@i);   # you cretins
1286    next if ($u =~ m@[/.]clicktomarket\.com\b@i); # more cretins
1287
1288    next if ($u =~ m@[/.]viewimages\.com\b@i);    # stacked deck
1289    next if ($u =~ m@[/.]gettyimages\.com\b@i);
1290
1291    LOG ($verbose_filter, "  candidate: $u");
1292    push @candidates, $u;
1293  }
1294
1295  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1296                                $timeout, @candidates);
1297}
1298
1299
1300
1301############################################################################
1302#
1303# Pick images from Aptix security cameras
1304# Cribbed liberally from google image search code.
1305# By Jason Sullivan <jasonsul@us.ibm.com>
1306#
1307############################################################################
1308
1309my $aptix_images_url = ("http://www.google.com/search" .
1310                        "?q=inurl:%22jpg/image.jpg%3Fr%3D%22");
1311
1312# securitycam
1313sub pick_from_security_camera($) {
1314  my ($timeout) = @_;
1315
1316  my $page = (int(rand(9)) + 1);
1317  my $num = 20;					# 20 images per page
1318  my $search_url = $aptix_images_url;
1319
1320  if ($page > 1) {
1321    $search_url .= "&start=" . $page*$num;	# page number
1322    $search_url .= "&num="   . $num;		#images per page
1323  }
1324
1325  my ($search_hit_count, @subpages) =
1326    pick_from_search_engine ($timeout, $search_url, '');
1327
1328  my @candidates = ();
1329  my %referers;
1330  foreach my $u (@subpages) {
1331    next if ($u =~ m@[/.]google\.com\b@i); # skip google builtins (most links)
1332    next unless ($u =~ m@jpg/image.jpg\?r=@i);    #  All pics contain this
1333
1334    LOG ($verbose_filter, "  candidate: $u");
1335    push @candidates, $u;
1336    $referers{$u} = $u;
1337    }
1338
1339  @candidates = depoison (@candidates);
1340  return () if ($#candidates < 0);
1341  my $i = int(rand($#candidates+1));
1342  my $img = $candidates[$i];
1343  my $ref = $referers{$img};
1344
1345  LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
1346  return ($ref, $img);
1347}
1348
1349
1350############################################################################
1351#
1352# Pick images by feeding random words into Google Image Search.
1353# By Charles Gales <gales@us.ibm.com>
1354#
1355############################################################################
1356
1357my $google_images_url = 'https://www.google.com/search' .
1358			'?source=lnms&tbm=isch&tbs=isz:l&q=';
1359
1360# googleimgs
1361sub pick_from_google_images($;$$) {
1362  my ($timeout, $words, $max_page) = @_;
1363
1364  if (!defined($words)) {
1365    $words = random_word();   # only one word for Google
1366  }
1367
1368  my $off = int(rand(40));
1369  my $search_url = $google_images_url . $words . "&start=" . $off;
1370
1371  my ($search_hit_count, @subpages) =
1372    pick_from_search_engine ($timeout, $search_url, $words);
1373
1374  my @candidates = ();
1375  foreach my $u (@subpages) {
1376    $u = html_unquote($u);
1377    # next if ($u =~ m@^https?://[^.]*\.(google|youtube)\.com/@s);
1378    next unless ($u =~ m@^https?://[^/]*\.gstatic\.com@s);
1379    LOG ($verbose_filter, "  candidate: $u");
1380    push @candidates, $u;
1381  }
1382
1383  @candidates = depoison (@candidates);
1384  return () if ($#candidates < 0);
1385  my $i = int(rand($#candidates+1));
1386  my $img = $candidates[$i];
1387
1388  LOG ($verbose_load, "picked image " . ($i+1) . ": $img");
1389  return ($img, $img);
1390}
1391
1392
1393
1394############################################################################
1395#
1396# Pick images by feeding random numbers into Google Image Search.
1397# By jwz, suggested by Ian O'Donnell.
1398#
1399############################################################################
1400
1401
1402# googlenums
1403sub pick_from_google_image_numbers($) {
1404  my ($timeout) = @_;
1405
1406  my $max = 9999;
1407  my $number = int(rand($max));
1408
1409  $number = sprintf("%04d", $number)
1410    if (rand() < 0.3);
1411
1412  pick_from_google_images ($timeout, "$number");
1413}
1414
1415
1416
1417############################################################################
1418#
1419# Pick images by feeding random digital camera file names into
1420# Google Image Search.
1421# By jwz, inspired by the excellent Random Personal Picture Finder
1422# at http://www.diddly.com/random/
1423# May 2017: Commented out a bunch of formats that have fallen out of favor.
1424#
1425############################################################################
1426
1427my @photomakers = (
1428  #
1429  # Common digital camera file name formats, as described at
1430  # http://www.diddly.com/random/about.html
1431  #
1432# sub { sprintf ("dcp%05d.jpg",  int(rand(4000))); },	# Kodak
1433  sub { sprintf ("dsc%05d.jpg",  int(rand(4000))); },	# Nikon
1434  sub { sprintf ("dscn%04d.jpg", int(rand(4000))); },	# Nikon
1435# sub { sprintf ("mvc-%03d.jpg", int(rand(999)));  },	# Sony Mavica
1436# sub { sprintf ("mvc%05d.jpg",  int(rand(9999))); },	# Sony Mavica
1437# sub { sprintf ("P101%04d.jpg", int(rand(9999))); },	# Olympus w/ date=101
1438# sub { sprintf ("P%x%02d%04d.jpg",			# Olympus
1439#                int(rand(0xC)), int(rand(30))+1,
1440#                rand(9999)); },
1441  sub { sprintf ("IMG_%03d.jpg",  int(rand(999))); },	# ?
1442# sub { sprintf ("IMAG%04d.jpg",  int(rand(9999))); },	# RCA and Samsung
1443# sub { my $n = int(rand(9999));			# Canon
1444#         sprintf ("1%02d-%04d.jpg", int($n/100), $n); },
1445# sub { my $n = int(rand(9999));			# Canon
1446#         sprintf ("1%02d-%04d_IMG.jpg",
1447#                  int($n/100), $n); },
1448  sub { sprintf ("IMG_%04d.jpg", int(rand(9999))); },	# Canon
1449  sub { sprintf ("dscf%04d.jpg", int(rand(9999))); },	# Fuji Finepix
1450# sub { sprintf ("pdrm%04d.jpg", int(rand(9999))); },	# Toshiba PDR
1451# sub { sprintf ("IM%06d.jpg", int(rand(9999))); },	# HP Photosmart
1452# sub { sprintf ("EX%06d.jpg", int(rand(9999))); },	# HP Photosmart
1453#  sub { my $n = int(rand(3));				# Kodak DC-40,50,120
1454#        sprintf ("DC%04d%s.jpg", int(rand(9999)),
1455#                 $n == 0 ? 'S' : $n == 1 ? 'M' : 'L'); },
1456  sub { sprintf ("pict%04d.jpg", int(rand(9999))); },	# Minolta Dimage
1457# sub { sprintf ("P%07d.jpg", int(rand(9999))); },	# Kodak DC290
1458#  sub { sprintf ("%02d%02d%04d.jpg",			# Casio QV3000, QV4000
1459#                 int(rand(12))+1, int(rand(31))+1,
1460#                 int(rand(999))); },
1461#  sub { sprintf ("%02d%x%02d%04d.jpg",			# Casio QV7000
1462#                 int(rand(6)), # year
1463#                 int(rand(12))+1, int(rand(31))+1,
1464#                 int(rand(999))); },
1465  sub { sprintf ("IMGP%04d.jpg", int(rand(9999))); },	# Pentax Optio S
1466# sub { sprintf ("PANA%04d.jpg", int(rand(9999))); },	# Panasonic vid still
1467  sub { sprintf ("HPIM%04d.jpg", int(rand(9999))); },	# HP Photosmart
1468# sub { sprintf ("PCDV%04d.jpg", int(rand(9999))); },	# ?
1469 );
1470
1471
1472# googlephotos
1473sub pick_from_google_image_photos($) {
1474  my ($timeout) = @_;
1475
1476  my $i = int(rand($#photomakers + 1));
1477  my $fn = $photomakers[$i];
1478  my $file = &$fn;
1479  #$file .= "%20filetype:jpg";
1480
1481  pick_from_google_images ($timeout, $file);
1482}
1483
1484
1485############################################################################
1486#
1487# Pick images by feeding random words into Google Image Search.
1488# By the way: fuck Microsoft.
1489#
1490############################################################################
1491
1492my $bing_images_url =	"http://www.bing.com/images/async?q=";
1493
1494
1495# bingimgs
1496sub pick_from_bing_images($;$$) {
1497  my ($timeout, $words, $max_page) = @_;
1498
1499  if (!defined($words)) {
1500    $words = random_word();   # only one word for Bing
1501  }
1502
1503  my $off = int(rand(300));
1504  my $search_url = $bing_images_url . $words . "&first=" . $off;
1505
1506  my ($search_hit_count, @subpages) =
1507    pick_from_search_engine ($timeout, $search_url, $words);
1508
1509  my @candidates = ();
1510  my %referers;
1511  foreach my $u (@subpages) {
1512    my ($img, $ref) = ($u =~ m/^(.*?)\t(.*)$/s);
1513    next unless $img;
1514    LOG ($verbose_filter, "  candidate: $ref");
1515    push @candidates, $img;
1516    $referers{$img} = $ref;
1517  }
1518
1519  @candidates = depoison (@candidates);
1520  return () if ($#candidates < 0);
1521  my $i = int(rand($#candidates+1));
1522  my $img = $candidates[$i];
1523  my $ref = $referers{$img};
1524
1525  LOG ($verbose_load, "picked image " . ($i+1) . ": $img (on $ref)");
1526  return ($ref, $img);
1527}
1528
1529
1530
1531
1532############################################################################
1533#
1534# Pick images by feeding random numbers into Bing Image Search.
1535#
1536############################################################################
1537
1538# bingnums
1539sub pick_from_bing_image_numbers($) {
1540  my ($timeout) = @_;
1541
1542  my $max = 9999;
1543  my $number = int(rand($max));
1544
1545  $number = sprintf("%04d", $number)
1546    if (rand() < 0.3);
1547
1548  pick_from_bing_images ($timeout, "$number");
1549}
1550
1551
1552############################################################################
1553#
1554# Pick images by feeding random numbers into Bing Image Search.
1555#
1556############################################################################
1557
1558# bingphotos
1559sub pick_from_bing_image_photos($) {
1560  my ($timeout) = @_;
1561
1562  my $i = int(rand($#photomakers + 1));
1563  my $fn = $photomakers[$i];
1564  my $file = &$fn;
1565
1566  pick_from_bing_images ($timeout, $file);
1567}
1568
1569
1570############################################################################
1571#
1572# Pick images by feeding random words into Alta Vista Text Search
1573#
1574############################################################################
1575
1576
1577my $alta_vista_url = "http://www.altavista.com/web/results" .
1578                     "?pg=aq" .
1579                     "&aqmode=s" .
1580                     "&filetype=html" .
1581                     "&sc=on" .        # "site collapse"
1582                     "&nbq=50" .
1583                     "&aqo=";
1584
1585# avtext
1586sub pick_from_alta_vista_text($) {
1587  my ($timeout) = @_;
1588
1589  my $words = random_words('%20');
1590  my $page = (int(rand(9)) + 1);
1591  my $search_url = $alta_vista_url . $words;
1592
1593  if ($page > 1) {
1594    $search_url .= "&pgno=" . $page;
1595    $search_url .= "&stq=" . (($page-1) * 10);
1596  }
1597
1598  my ($search_hit_count, @subpages) =
1599    pick_from_search_engine ($timeout, $search_url, $words);
1600
1601  my @candidates = ();
1602  foreach my $u (@subpages) {
1603
1604    # Those altavista fuckers are playing really nasty redirection games
1605    # these days: the filter your clicks through their site, but use
1606    # onMouseOver to make it look like they're not!  Well, it makes it
1607    # easier for us to identify search results...
1608    #
1609    next unless ($u =~ s/^.*\*\*(http%3a.*$)/$1/gsi);
1610    $u = url_unquote($u);
1611
1612    next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1613    next if ($u =~ m@[/.]altavista\.com\b@i);     # skip altavista builtins
1614    next if ($u =~ m@[/.]yahoo\.com\b@i);         # yahoo and av in cahoots?
1615
1616    LOG ($verbose_filter, "  candidate: $u");
1617    push @candidates, $u;
1618  }
1619
1620  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1621                                $timeout, @candidates);
1622}
1623
1624
1625
1626############################################################################
1627#
1628# Pick images by feeding random words into Hotbot
1629#
1630############################################################################
1631
1632my $hotbot_search_url =("http://hotbot.lycos.com/default.asp" .
1633                        "?ca=w" .
1634                        "&descriptiontype=0" .
1635                        "&imagetoggle=1" .
1636                        "&matchmode=any" .
1637                        "&nummod=2" .
1638                        "&recordcount=50" .
1639                        "&sitegroup=1" .
1640                        "&stem=1" .
1641                        "&cobrand=undefined" .
1642                        "&query=");
1643
1644sub pick_from_hotbot_text($) {
1645  my ($timeout) = @_;
1646
1647  $last_search = $hotbot_search_url;   # for warnings
1648
1649  # lycos seems to always give us back dictionaries and word lists if
1650  # we search for more than one word...
1651  #
1652  my $words = random_word();
1653
1654  my $start = int(rand(8)) * 10 + 1;
1655  my $search_url = $hotbot_search_url . $words . "&first=$start&page=more";
1656
1657  my ($search_hit_count, @subpages) =
1658    pick_from_search_engine ($timeout, $search_url, $words);
1659
1660  my @candidates = ();
1661  foreach my $u (@subpages) {
1662
1663    # Hotbot plays redirection games too
1664    # (not any more?)
1665#    next unless ($u =~ m@/director.asp\?.*\btarget=([^&]+)@);
1666#    $u = url_decode($1);
1667
1668    next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1669    next if ($u =~ m@[/.]hotbot\.com\b@i);     # skip hotbot builtins
1670    next if ($u =~ m@[/.]lycos\.com\b@i);      # skip hotbot builtins
1671    next if ($u =~ m@[/.]inktomi\.com\b@i);    # skip hotbot builtins
1672
1673    LOG ($verbose_filter, "  candidate: $u");
1674    push @candidates, $u;
1675  }
1676
1677  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1678                                $timeout, @candidates);
1679}
1680
1681
1682
1683############################################################################
1684#
1685# Pick images by feeding random words into Lycos
1686#
1687############################################################################
1688
1689my $lycos_search_url = "http://search.lycos.com/default.asp" .
1690                       "?lpv=1" .
1691                       "&loc=searchhp" .
1692                       "&tab=web" .
1693                       "&query=";
1694
1695sub pick_from_lycos_text($) {
1696  my ($timeout) = @_;
1697
1698  $last_search = $lycos_search_url;   # for warnings
1699
1700  # lycos seems to always give us back dictionaries and word lists if
1701  # we search for more than one word...
1702  #
1703  my $words = random_word();
1704
1705  my $start = int(rand(8)) * 10 + 1;
1706  my $search_url = $lycos_search_url . $words . "&first=$start&page=more";
1707
1708  my ($search_hit_count, @subpages) =
1709    pick_from_search_engine ($timeout, $search_url, $words);
1710
1711  my @candidates = ();
1712  foreach my $u (@subpages) {
1713
1714    # Lycos plays redirection games.
1715    # (not any more?)
1716#    next unless ($u =~ m@^https?://click.lycos.com/director.asp
1717#                         .*
1718#                         \btarget=([^&]+)
1719#                         .*
1720#                        @x);
1721#    $u = url_decode($1);
1722
1723    next unless ($u =~ m@^https?://@i);    #  skip non-HTTP or relative URLs
1724    next if ($u =~ m@[/.]hotbot\.com\b@i);     # skip lycos builtins
1725    next if ($u =~ m@[/.]lycos\.com\b@i);      # skip lycos builtins
1726    next if ($u =~ m@[/.]terralycos\.com\b@i); # skip lycos builtins
1727    next if ($u =~ m@[/.]inktomi\.com\b@i);    # skip lycos builtins
1728
1729
1730    LOG ($verbose_filter, "  candidate: $u");
1731    push @candidates, $u;
1732  }
1733
1734  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1735                                $timeout, @candidates);
1736}
1737
1738
1739
1740############################################################################
1741#
1742# Pick images by feeding random words into news.yahoo.com
1743#
1744############################################################################
1745
1746my $yahoo_news_url = "http://news.search.yahoo.com/search/news" .
1747                     "?c=news_photos" .
1748                     "&p=";
1749
1750# yahoonews
1751sub pick_from_yahoo_news_text($) {
1752  my ($timeout) = @_;
1753
1754  $last_search = $yahoo_news_url;   # for warnings
1755
1756  my $words = random_word();
1757  my $search_url = $yahoo_news_url . $words;
1758
1759  my ($search_hit_count, @subpages) =
1760    pick_from_search_engine ($timeout, $search_url, $words);
1761
1762  my @candidates = ();
1763  foreach my $u (@subpages) {
1764
1765    # de-redirectize the URLs
1766    $u =~ s@^https?://rds\.yahoo\.com/.*-http%3A@http:@s;
1767
1768    # only accept URLs on Yahoo's news site
1769    next unless ($u =~ m@^https?://dailynews\.yahoo\.com/@i ||
1770                 $u =~ m@^https?://story\.news\.yahoo\.com/@i);
1771    next unless ($u =~ m@&u=/@);
1772
1773    LOG ($verbose_filter, "  candidate: $u");
1774    push @candidates, $u;
1775  }
1776
1777  return pick_image_from_pages ($search_url, $search_hit_count, $#subpages+1,
1778                                $timeout, @candidates);
1779}
1780
1781
1782
1783############################################################################
1784#
1785# Pick images from LiveJournal's list of recently-posted images.
1786#
1787############################################################################
1788
1789my $livejournal_img_url = "http://www.livejournal.com/stats/latest-img.bml";
1790
1791# With most of our image sources, we get a random page and then select
1792# from the images on it.  However, in the case of LiveJournal, the page
1793# of images tends to update slowly; so we'll remember the last N entries
1794# on it and randomly select from those, to get a wider variety each time.
1795
1796my $lj_cache_size = 1000;
1797my @lj_cache = (); # fifo, for ordering by age
1798my %lj_cache = (); # hash, for detecting dups
1799
1800# livejournal
1801sub pick_from_livejournal_images($) {
1802  my ($timeout) = @_;
1803
1804  $last_search = $livejournal_img_url;   # for warnings
1805
1806  my ( $base, $body ) = get_document ($livejournal_img_url, undef, $timeout);
1807
1808  # Often the document comes back empty. If so, just use the cache.
1809  # return () unless $body;
1810  $body = '' unless defined($body);
1811
1812  $body =~ s/\n/ /gs;
1813  $body =~ s/(<recent-image)\b/\n$1/gsi;
1814
1815  foreach (split (/\n/, $body)) {
1816    next unless (m/^<recent-image\b/);
1817    next unless (m/\bIMG=[\'\"]([^\'\"]+)[\'\"]/si);
1818    my $img = html_unquote ($1);
1819
1820    next if ($lj_cache{$img}); # already have it
1821
1822    next unless (m/\bURL=[\'\"]([^\'\"]+)[\'\"]/si);
1823    my $page = html_unquote ($1);
1824    my @pair = ($img, $page);
1825    LOG ($verbose_filter, "  candidate: $img");
1826    push @lj_cache, \@pair;
1827    $lj_cache{$img} = \@pair;
1828  }
1829
1830  return () if ($#lj_cache == -1);
1831
1832  my $n = $#lj_cache+1;
1833  my $i = int(rand($n));
1834  my ($img, $page) = @{$lj_cache[$i]};
1835
1836  # delete this one from @lj_cache and from %lj_cache.
1837  #
1838  @lj_cache = ( @lj_cache[0 .. $i-1],
1839                @lj_cache[$i+1 .. $#lj_cache] );
1840  delete $lj_cache{$img};
1841
1842  # Keep the size of the cache under the limit by nuking older entries
1843  #
1844  while ($#lj_cache >= $lj_cache_size) {
1845    my $pairP = shift @lj_cache;
1846    my $img = $pairP->[0];
1847    delete $lj_cache{$img};
1848  }
1849
1850  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
1851
1852  return ($page, $img);
1853}
1854
1855
1856############################################################################
1857#
1858# Pick images from ircimages.com (images that have been in the /topic of
1859# various IRC channels.)
1860#
1861############################################################################
1862
1863my $ircimages_url = "http://ircimages.com/";
1864
1865# ircimages
1866sub pick_from_ircimages($) {
1867  my ($timeout) = @_;
1868
1869  $last_search = $ircimages_url;   # for warnings
1870
1871  my $n = int(rand(2900));
1872  my $search_url = $ircimages_url . "page-$n";
1873
1874  my ( $base, $body ) = get_document ($search_url, undef, $timeout);
1875  return () unless $body;
1876
1877  my @candidates = ();
1878
1879  $body =~ s/\n/ /gs;
1880  $body =~ s/(<A)\b/\n$1/gsi;
1881
1882  foreach (split (/\n/, $body)) {
1883
1884    my ($u) = m@<A\s.*\bHREF\s*=\s*([^>]+)>@i;
1885    next unless $u;
1886
1887    if ($u =~ m/^\"([^\"]*)\"/) { $u = $1; }   # quoted string
1888    elsif ($u =~ m/^([^\s]*)\s/) { $u = $1; }  # or token
1889
1890    next unless ($u =~ m/^https?:/i);
1891    next if ($u =~ m@^https?://(searchirc\.com\|ircimages\.com)@i);
1892    next unless ($u =~ m@[.](gif|jpg|jpeg|pjpg|pjpeg|png)$@i);
1893
1894    LOG ($verbose_http, "    HREF: $u");
1895    push @candidates, $u;
1896  }
1897
1898  LOG ($verbose_filter, "" . $#candidates+1 . " links on $search_url");
1899
1900  return () if ($#candidates == -1);
1901
1902  my $i = int(rand($#candidates+1));
1903  my $img = $candidates[$i];
1904
1905  LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#candidates+1) .
1906       ": $img");
1907
1908  $search_url = $img;  # hmm...
1909  return ($search_url, $img);
1910}
1911
1912
1913############################################################################
1914#
1915# Pick images from Twitpic's list of recently-posted images.
1916#
1917############################################################################
1918
1919my $twitpic_img_url = "http://twitpic.com/public_timeline/feed.rss";
1920
1921# With most of our image sources, we get a random page and then select
1922# from the images on it.  However, in the case of Twitpic, the page
1923# of images tends to update slowly; so we'll remember the last N entries
1924# on it and randomly select from those, to get a wider variety each time.
1925
1926my $twitpic_cache_size = 1000;
1927my @twitpic_cache = (); # fifo, for ordering by age
1928my %twitpic_cache = (); # hash, for detecting dups
1929
1930# twitpic
1931sub pick_from_twitpic_images($) {
1932  my ($timeout) = @_;
1933
1934  $last_search = $twitpic_img_url;   # for warnings
1935
1936  my ( $base, $body ) = get_document ($twitpic_img_url, undef, $timeout);
1937
1938  # Update the cache.
1939
1940  if ($body) {
1941    $body =~ s/\n/ /gs;
1942    $body =~ s/(<item)\b/\n$1/gsi;
1943
1944    my @items = split (/\n/, $body);
1945    shift @items;
1946    foreach (@items) {
1947      next unless (m@<link>([^<>]*)</link>@si);
1948      my $page = html_unquote ($1);
1949
1950      $page =~ s@/$@@s;
1951      $page .= '/full';
1952
1953      next if ($twitpic_cache{$page}); # already have it
1954
1955      LOG ($verbose_filter, "  candidate: $page");
1956      push @twitpic_cache, $page;
1957      $twitpic_cache{$page} = $page;
1958    }
1959  }
1960
1961  # Pull from the cache.
1962
1963  return () if ($#twitpic_cache == -1);
1964
1965  my $n = $#twitpic_cache+1;
1966  my $i = int(rand($n));
1967  my $page = $twitpic_cache[$i];
1968
1969  # delete this one from @twitpic_cache and from %twitpic_cache.
1970  #
1971  @twitpic_cache = ( @twitpic_cache[0 .. $i-1],
1972                     @twitpic_cache[$i+1 .. $#twitpic_cache] );
1973  delete $twitpic_cache{$page};
1974
1975  # Keep the size of the cache under the limit by nuking older entries
1976  #
1977  while ($#twitpic_cache >= $twitpic_cache_size) {
1978    my $page = shift @twitpic_cache;
1979    delete $twitpic_cache{$page};
1980  }
1981
1982  ( $base, $body ) = get_document ($page, undef, $timeout);
1983  my $img = undef;
1984  $body = '' unless defined($body);
1985
1986  foreach (split (/<img\s+/, $body)) {
1987    my ($src) = m/\bsrc=[\"\'](.*?)[\"\']/si;
1988    next unless $src;
1989    next if m@/js/@s;
1990    next if m@/images/@s;
1991
1992    $img = $src;
1993
1994    $img = "http:$img" if ($img =~ m@^//@s);  # Oh come on
1995
1996    # Sometimes these images are hosted on twitpic, sometimes on Amazon.
1997    if ($img =~ m@^/@) {
1998      $base =~ s@^(https?://[^/]+)/.*@$1@s;
1999      $img = $base . $img;
2000    }
2001    last;
2002  }
2003
2004  if (!$img) {
2005    LOG ($verbose_load, "no matching images on $page\n");
2006    return ();
2007  }
2008
2009  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2010
2011  return ($page, $img);
2012}
2013
2014
2015############################################################################
2016#
2017# Pick images from Twitter's list of recently-posted updates.
2018#
2019############################################################################
2020
2021# With most of our image sources, we get a random page and then select
2022# from the images on it.  However, in the case of Twitter, the page
2023# of images only updates once a minute; so we'll remember the last N entries
2024# on it and randomly select from those, to get a wider variety each time.
2025
2026my $twitter_img_url = "http://api.twitter.com/1/statuses/" .
2027		      "public_timeline.json" .
2028		      "?include_entities=true" .
2029		      "&include_rts=true" .
2030		      "&count=200";
2031
2032my $twitter_cache_size = 1000;
2033
2034my @twitter_cache = (); # fifo, for ordering by age
2035my %twitter_cache = (); # hash, for detecting dups
2036
2037
2038# twitter
2039sub pick_from_twitter_images($) {
2040  my ($timeout) = @_;
2041
2042  $last_search = $twitter_img_url;   # for warnings
2043
2044  my ( $base, $body ) = get_document ($twitter_img_url, undef, $timeout);
2045  # Update the cache.
2046
2047  if ($body) {
2048    $body =~ s/[\r\n]+/ /gs;
2049
2050    # Parsing JSON is a pain in the ass.  So we halfass it as usual.
2051    $body =~ s/^\[|\]$//s;
2052    $body =~ s/(\[.*?\])/{ $_ = $1; s@\},@\} @gs; $_; }/gsexi;
2053    my @items = split (/\},\{/, $body);
2054    foreach (@items) {
2055      my ($name) = m@"screen_name":"([^\"]+)"@si;
2056      my ($img)  = m@"media_url":"([^\"]+)"@si;
2057      my ($page) = m@"display_url":"([^\"]+)"@si;
2058      next unless ($name && $img && $page);
2059      foreach ($img, $page) {
2060        s/\\//gs;
2061        $_ = "http://$_" unless (m/^http/si);
2062      }
2063
2064      next if ($twitter_cache{$page}); # already have it
2065
2066      LOG ($verbose_filter, "  candidate: $page - $img");
2067      push @twitter_cache, $page;
2068      $twitter_cache{$page} = $img;
2069    }
2070  }
2071
2072  # Pull from the cache.
2073
2074  return () if ($#twitter_cache == -1);
2075
2076  my $n = $#twitter_cache+1;
2077  my $i = int(rand($n));
2078  my $page = $twitter_cache[$i];
2079  my $url  = $twitter_cache{$page};
2080
2081  # delete this one from @twitter_cache and from %twitter_cache.
2082  #
2083  @twitter_cache = ( @twitter_cache[0 .. $i-1],
2084                     @twitter_cache[$i+1 .. $#twitter_cache] );
2085  delete $twitter_cache{$page};
2086
2087  # Keep the size of the cache under the limit by nuking older entries
2088  #
2089  while ($#twitter_cache >= $twitter_cache_size) {
2090    my $page = shift @twitter_cache;
2091    delete $twitter_cache{$page};
2092  }
2093
2094  LOG ($verbose_load, "picked page $url");
2095
2096  $suppress_audit = 1;
2097
2098  return ($page, $url);
2099}
2100
2101
2102############################################################################
2103#
2104# Pick images from Flickr's page of recently-posted photos.
2105#
2106############################################################################
2107
2108my $flickr_img_url = "http://www.flickr.com/explore/";
2109
2110# Like LiveJournal, the Flickr page of images tends to update slowly,
2111# so remember the last N entries on it and randomly select from those.
2112
2113# I know that Flickr has an API (http://www.flickr.com/services/api/)
2114# but it was easy enough to scrape the HTML, so I didn't bother exploring.
2115
2116my $flickr_cache_size = 1000;
2117my @flickr_cache = (); # fifo, for ordering by age
2118my %flickr_cache = (); # hash, for detecting dups
2119
2120
2121# flickr_recent
2122sub pick_from_flickr_recent($) {
2123  my ($timeout) = @_;
2124
2125  my $start = 16 * int(rand(100));
2126
2127  $last_search = $flickr_img_url;   # for warnings
2128  $last_search .= "?start=$start" if ($start > 0);
2129
2130  my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2131
2132  # If the document comes back empty. just use the cache.
2133  # return () unless $body;
2134  $body = '' unless defined($body);
2135
2136  my $count = 0;
2137  my $count2 = 0;
2138
2139  if ($body =~ m@{ *"_data": \[ ( .*? \} ) \]@six) {
2140    $body = $1;
2141  } else {
2142    LOG ($verbose_load, "flickr unparsable: $last_search");
2143    return ();
2144  }
2145
2146  $body =~ s/[\r\n]/ /gs;
2147  $body =~ s/(\},) *(\{)/$1\n$2/gs;     # "_flickrModelRegistry"
2148
2149  foreach my $chunk (split (/\n/, $body)) {
2150    my ($img) = ($chunk =~ m@"displayUrl": *"(.*?)"@six);
2151    next unless defined ($img);
2152    $img =~ s/\\//gs;
2153    $img = "http:$img" unless ($img =~ m/^http/s);
2154
2155    my ($user) = ($chunk =~ m/"pathAlias": *"(.*?)"/si);
2156    next unless defined ($user);
2157
2158    my ($id) = ($img =~ m@/\d+/(\d+)_([\da-f]+)_@si);
2159    my ($page) = "https://www.flickr.com/photos/$user/$id/";
2160
2161    # $img =~ s/_[a-z](\.[a-z\d]+)$/$1/si;  # take off "thumb" suffix
2162
2163    $count++;
2164    next if ($flickr_cache{$img}); # already have it
2165
2166    my @pair = ($img, $page, $start);
2167    LOG ($verbose_filter, "  candidate: $img");
2168    push @flickr_cache, \@pair;
2169    $flickr_cache{$img} = \@pair;
2170    $count2++;
2171  }
2172
2173  return () if ($#flickr_cache == -1);
2174
2175  my $n = $#flickr_cache+1;
2176  my $i = int(rand($n));
2177  my ($img, $page) = @{$flickr_cache[$i]};
2178
2179  # delete this one from @flickr_cache and from %flickr_cache.
2180  #
2181  @flickr_cache = ( @flickr_cache[0 .. $i-1],
2182                    @flickr_cache[$i+1 .. $#flickr_cache] );
2183  delete $flickr_cache{$img};
2184
2185  # Keep the size of the cache under the limit by nuking older entries
2186  #
2187  while ($#flickr_cache >= $flickr_cache_size) {
2188    my $pairP = shift @flickr_cache;
2189    my $img = $pairP->[0];
2190    delete $flickr_cache{$img};
2191  }
2192
2193  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2194
2195  return ($page, $img);
2196}
2197
2198
2199############################################################################
2200#
2201# Pick images from a random RSS feed on Flickr.
2202#
2203############################################################################
2204
2205my $flickr_rss_base = ("http://www.flickr.com/services/feeds/" .
2206                       "photos_public.gne" .
2207                       "?format=rss_200_enc&tagmode=any&tags=");
2208
2209# Picks a random RSS feed; picks a random image from that feed;
2210# returns 2 URLs: the page containing the image, and the image.
2211# Mostly by Joe Mcmahon <mcmahon@yahoo-inc.com>
2212#
2213# flickr_random
2214sub pick_from_flickr_random($) {
2215  my $timeout = shift;
2216
2217  my $words = random_words(',');
2218  my $rss = $flickr_rss_base . $words;
2219  $last_search = $rss;
2220
2221  $_ = $words;
2222  s/,/ /g;
2223
2224  print STDERR "\n\n" if ($verbose_load);
2225  LOG ($verbose_load, "words: $_");
2226  LOG ($verbose_load, "URL: $last_search");
2227
2228  $suppress_audit = 1;
2229
2230  my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2231  if (!$base || !$body) {
2232    $body = undef;
2233    return;
2234  }
2235
2236  my $img;
2237  ($base, $img) = pick_image_from_rss ($base, $body);
2238  $body = undef;
2239  return () unless defined ($img);
2240
2241  LOG ($verbose_load, "redirected to: $base");
2242  return ($base, $img);
2243}
2244
2245
2246############################################################################
2247#
2248# Pick random images from Instagram.
2249#
2250############################################################################
2251
2252my $instagram_url_base = "https://api.instagram.com/v1/media/popular";
2253
2254# instagram_random
2255sub pick_from_instagram($) {
2256  my $timeout = shift;
2257
2258  # Liberated access tokens.
2259  # jsdo.it search for: instagram client_id
2260  # Google search for: instagram "&client_id=" site:jsfiddle.net
2261  my @tokens = (#'b59fbe4563944b6c88cced13495c0f49', # gramfeed.com
2262                #'fa26679250df49c48a33fbcf30aae989', # instac.at
2263                #'d9494686198d4dfeb954979a3e270e5e', # iconosquare.com
2264                #'793ef48bb18e4197b61afce2d799b81c', # jsdo.it
2265                #'67b8a3e0073449bba70600d0fc68e6cb', # jsdo.it
2266                #'26a098e0df4d4b9ea8b4ce6c505b7742', # jsdo.it
2267                #'2437cbcd906a4c10940f990d283d3cd5', # jsdo.it
2268                #'191c7d7d5312464cbd92134f36ffdab5', # jsdo.it
2269                #'acfec809437b4340b2c38f66503af774', # jsdo.it
2270                #'e9f77604a3a24beba949c12d18130988', # jsdo.it
2271                #'2cd7bcf68ae346529770073d311575b3', # jsdo.it
2272                #'830c600fe8d742e2ab3f3b94f9bb22b7', # jsdo.it
2273                #'55865a0397ad41e5997dd95ef4df8da1', # jsdo.it
2274                #'192a5742f3644ea8bed1d25e439286a8', # jsdo.it
2275                #'38ed1477e7a44595861b8842cdb8ba23', # jsdo.it
2276                #'e52f79f645f54488ad0cc47f6f55ade6', # jsfiddle.net
2277                );
2278
2279  my $tok = $tokens[int(rand($#tokens+1))];
2280  $last_search = $instagram_url_base . "?client_id=" . $tok;
2281
2282  print STDERR "\n\n" if ($verbose_load);
2283  LOG ($verbose_load, "URL: $last_search");
2284
2285  my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2286  if (!$base || !$body) {
2287    $body = undef;
2288    return;
2289  }
2290
2291  $body =~ s/("link")/\001$1/gs;
2292  my @chunks = split(/\001/, $body);
2293  shift @chunks;
2294  my @urls = ();
2295  foreach (@chunks) {
2296    s/\\//gs;
2297    my ($url) = m/"link":\s*"(.*?)"/s;
2298    my ($img) = m/"standard_resolution":\{"url":\s*"(.*?)"/s;
2299       ($img) = m/"url":\s*"(.*?)"/s unless $url;
2300    next unless ($url && $img);
2301    push @urls, [ $url, $img ];
2302  }
2303
2304  if ($#urls < 0) {
2305    LOG ($verbose_load, "no images on $last_search");
2306    return ();
2307  }
2308
2309  my $i = int(rand($#urls+1));
2310  my ($url, $img) = @{$urls[$i]};
2311
2312  LOG ($verbose_load, "picked image " .($i+1) . "/" . ($#urls+1) . ": $url");
2313  return ($url, $img);
2314}
2315
2316
2317############################################################################
2318#
2319# Pick images from Imgur.
2320#
2321############################################################################
2322
2323my $imgur_base = 'http://imgur.com/search?qs=thumb&q_any=';
2324
2325sub pick_from_imgur($) {
2326  my $timeout = shift;
2327
2328  my $words = random_words('%20');
2329  $last_search = $imgur_base . $words;
2330
2331  $_ = $words;
2332  s/%20/ /g;
2333
2334  print STDERR "\n\n" if ($verbose_load);
2335  LOG ($verbose_load, "words: $_");
2336  LOG ($verbose_load, "URL: $last_search");
2337
2338  $suppress_audit = 1;
2339
2340  my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2341  if (!$base || !$body) {
2342    $body = undef;
2343    return;
2344  }
2345
2346  my @imgs = ($body =~ m@\bHREF=[\"\']([^\'\"<>]*/gallery/[^\'\"<>]+)@gsi);
2347  return () unless @imgs;
2348
2349  my $n = @imgs;
2350  my $i = int(rand($n));
2351  my $page = $imgs[$i];
2352  $page =~ s/[?&].*$//s;
2353  $page = "http://imgur.com$page" if ($page =~ m@^/@s);
2354
2355  my ($id) = ($page =~ m@([^/?&]+)$@s);
2356  my $img = "http://i.imgur.com/$id.jpg";
2357
2358  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2359
2360  return ($page, $img);
2361}
2362
2363
2364############################################################################
2365#
2366# Pick images from Tumblr.
2367#
2368############################################################################
2369
2370my $tumblr_base = 'https://www.tumblr.com/search/';
2371
2372sub pick_from_tumblr($) {
2373  my $timeout = shift;
2374
2375  # Tumblr doesn't have an "or" search, which means our vocabulary is
2376  # a bit too extensive to work well...
2377
2378  my $words = random_word();
2379  $last_search = $tumblr_base . $words;
2380
2381  print STDERR "\n\n" if ($verbose_load);
2382  LOG ($verbose_load, "words: $words");
2383  LOG ($verbose_load, "URL: $last_search");
2384
2385  $suppress_audit = 1;
2386
2387  my ( $base, $body ) = get_document ($last_search, undef, $timeout);
2388  if (!$base || !$body) {
2389    $body = undef;
2390    return;
2391  }
2392
2393  my @imgs0 = ($body =~ m@<IMG\b([^<>]*)>@gsi);
2394  return () unless @imgs0;
2395  my @imgs;
2396  foreach my $img (@imgs0) {
2397    my ($src)  = ($img =~ m@\bsrc=[\"\'](.*?)[\"\']@si);
2398    my ($href) = ($img =~ m@\bdata-pin-url=[\"\'](.*?)[\"\']@si);
2399    next unless ($src && $href);
2400    next if ($src =~ m/^data:/s);
2401    foreach ($src, $href) { $_ = "http://www.tumblr.com$_" if (m@^/@s); }
2402    push @imgs, [$href, $src];
2403  }
2404  return () unless @imgs;
2405
2406  my $n = @imgs;
2407  my $i = int(rand($n));
2408  my $page = $imgs[$i]->[0];
2409  my $img  = $imgs[$i]->[1];
2410
2411  LOG ($verbose_load, "picked image " .($i+1) . "/$n: $img");
2412
2413  return ($page, $img);
2414}
2415
2416
2417############################################################################
2418#
2419# Pick images by waiting for driftnet to populate a temp dir with files.
2420# Requires driftnet version 0.1.5 or later.
2421# (Driftnet is a program by Chris Lightfoot that sniffs your local ethernet
2422# for images being downloaded by others.)
2423# Driftnet/webcollage integration by jwz.
2424#
2425############################################################################
2426
2427# driftnet
2428sub pick_from_driftnet($) {
2429  my ($timeout) = @_;
2430
2431  my $id = $driftnet_magic;
2432  my $dir = $driftnet_dir;
2433  my $start = time;
2434  my $now;
2435
2436  error ("\$driftnet_dir unset?") unless ($dir);
2437  $dir =~ s@/+$@@;
2438
2439  error ("$dir unreadable") unless (-d "$dir/.");
2440
2441  $timeout = $http_timeout unless ($timeout);
2442  $last_search = $id;
2443
2444  while ($now = time, $now < $start + $timeout) {
2445    opendir (my $dir, $dir) || error ("$dir: $!");
2446    while (my $file = readdir($dir)) {
2447      next if ($file =~ m/^\./);
2448      $file = "$dir/$file";
2449      closedir ($dir);
2450      LOG ($verbose_load, "picked file $file ($id)");
2451      return ($id, $file);
2452    }
2453    closedir ($dir);
2454  }
2455  LOG (($verbose_net || $verbose_load), "timed out for $id");
2456  return ();
2457}
2458
2459
2460sub get_driftnet_file($) {
2461  my ($file) = @_;
2462
2463  error ("\$driftnet_dir unset?") unless ($driftnet_dir);
2464
2465  my $id = $driftnet_magic;
2466  error ("$id: $file not in $driftnet_dir?")
2467    unless ($file =~ m@^\Q$driftnet_dir@o);
2468
2469  open (my $in, '<', $file) || error ("$id: $file: $!");
2470  my $body = '';
2471  local $/ = undef;  # read entire file
2472  $body = <$in>;
2473  close ($in) || error ("$id: $file: $!");
2474  unlink ($file) || error ("$id: $file: rm: $!");
2475  return ($id, $body);
2476}
2477
2478
2479sub spawn_driftnet($) {
2480  my ($cmd) = @_;
2481
2482  # make a directory to use.
2483  while (1) {
2484    my $tmp = $ENV{TEMPDIR} || "/tmp";
2485    $driftnet_dir = sprintf ("$tmp/driftcollage-%08x", rand(0xffffffff));
2486    LOG ($verbose_exec, "mkdir $driftnet_dir");
2487    last if mkdir ($driftnet_dir, 0700);
2488  }
2489
2490  if (! ($cmd =~ m/\s/)) {
2491    # if the command didn't have any arguments in it, then it must be just
2492    # a pointer to the executable.  Append the default args to it.
2493    my $dargs = $default_driftnet_cmd;
2494    $dargs =~ s/^[^\s]+//;
2495    $cmd .= $dargs;
2496  }
2497
2498  # point the driftnet command at our newly-minted private directory.
2499  #
2500  $cmd .= " -d $driftnet_dir";
2501  $cmd .= ">/dev/null" unless ($verbose_exec);
2502
2503  my $pid = fork();
2504  if ($pid < 0) { error ("fork: $!\n"); }
2505  if ($pid) {
2506    # parent fork
2507    push @pids_to_kill, $pid;
2508    LOG ($verbose_exec, "forked for \"$cmd\"");
2509  } else {
2510    # child fork
2511    nontrapping_system ($cmd) || error ("exec: $!");
2512  }
2513
2514  # wait a bit, then make sure the process actually started up.
2515  #
2516  sleep (1);
2517  error ("pid $pid failed to start \"$cmd\"")
2518    unless (1 == kill (0, $pid));
2519}
2520
2521# local-directory
2522sub pick_from_local_dir($) {
2523  my ($timeout) = @_;
2524
2525  my $id = $local_magic;
2526  $last_search = $id;
2527
2528  my $dir = $local_dir;
2529  error ("\$local_dir unset?") unless ($dir);
2530  $dir =~ s@/+$@@;
2531
2532  error ("$dir unreadable") unless (-d "$dir/.");
2533
2534  my $v = ($verbose_exec ? "-v" : "");
2535  my $pick = `xscreensaver-getimage-file $v "$dir"`;
2536  $pick =~ s/\s+$//s;
2537  $pick = "$dir/$pick" unless ($pick =~ m@^/@s);       # relative path
2538
2539  LOG ($verbose_load, "picked file $pick ($id)");
2540  return ($id, $pick);
2541}
2542
2543
2544sub get_local_file($) {
2545  my ($file) = @_;
2546
2547  error ("\$local_dir unset?") unless ($local_dir);
2548
2549  my $id = $local_magic;
2550  error ("$id: $file not in $local_dir?")
2551    unless ($file =~ m@^\Q$local_dir@o);
2552
2553  open (my $in, '<:raw', $file) || error ("$id: $file: $!");
2554  local $/ = undef;  # read entire file
2555  my $body = <$in>;
2556  close ($in) || error ("$id: $file: $!");
2557  return ($id, $body);
2558}
2559
2560
2561
2562############################################################################
2563#
2564# Pick a random image in a random way
2565#
2566############################################################################
2567
2568
2569# Picks a random image on a random page, and returns two URLs:
2570# the page containing the image, and the image.
2571# Returns () if nothing found this time.
2572#
2573
2574sub pick_image(;$) {
2575  my ($timeout) = @_;
2576
2577  $current_state = "select";
2578  $load_method = "none";
2579
2580  my $n = int(rand(100));
2581  my $fn = undef;
2582  my $total = 0;
2583  my @rest = @search_methods;
2584
2585  while (@rest) {
2586    my $pct  = shift @rest;
2587    my $name = shift @rest;
2588    my $tfn  = shift @rest;
2589    $total += $pct;
2590    if ($total > $n && !defined($fn)) {
2591      $fn = $tfn;
2592      $current_state = $name;
2593      $load_method = $current_state;
2594    }
2595  }
2596
2597  if ($total != 100) {
2598    error ("internal error: \@search_methods totals to $total%!");
2599  }
2600
2601  record_attempt ($current_state);
2602  return $fn->($timeout);
2603}
2604
2605
2606
2607############################################################################
2608#
2609# Statistics and logging
2610#
2611############################################################################
2612
2613sub timestr() {
2614  return strftime ("%H:%M:%S: ", localtime);
2615}
2616
2617sub blurb() {
2618  return "$progname: " . timestr() . "$current_state: ";
2619}
2620
2621sub error($) {
2622  my ($err) = @_;
2623  print STDERR blurb() . "$err\n";
2624  exit 1;
2625}
2626
2627sub stacktrace() {
2628  my $i = 1;
2629  print STDERR "$progname: stack trace:\n";
2630  while (1) {
2631    my ($package, $filename, $line, $subroutine) = caller($i++);
2632    last unless defined($package);
2633    $filename =~ s@^.*/@@;
2634    print STDERR "  $filename#$line, $subroutine\n";
2635  }
2636}
2637
2638
2639my $lastlog = "";
2640
2641sub clearlog() {
2642  $lastlog = "";
2643}
2644
2645sub showlog() {
2646  my $head = "$progname: DEBUG: ";
2647  foreach (split (/\n/, $lastlog)) {
2648    print STDERR "$head$_\n";
2649  }
2650  $lastlog = "";
2651}
2652
2653sub LOG($$) {
2654  my ($print, $msg) = @_;
2655  my $blurb = timestr() . "$current_state: ";
2656  $lastlog .= "$blurb$msg\n";
2657  print STDERR "$progname: $blurb$msg\n" if $print;
2658}
2659
2660
2661my %stats_attempts;
2662my %stats_successes;
2663my %stats_elapsed;
2664
2665my $last_state = undef;
2666sub record_attempt($) {
2667  my ($name) = @_;
2668
2669  if ($last_state) {
2670    record_failure($last_state) unless ($image_succeeded > 0);
2671  }
2672  $last_state = $name;
2673
2674  clearlog();
2675  report_performance();
2676
2677  start_timer($name);
2678  $image_succeeded = 0;
2679  $suppress_audit = 0;
2680}
2681
2682sub record_success($$$) {
2683  my ($name, $url, $base) = @_;
2684  if (defined($stats_successes{$name})) {
2685    $stats_successes{$name}++;
2686  } else {
2687    $stats_successes{$name} = 1;
2688  }
2689
2690  stop_timer ($name, 1);
2691  my $o = $current_state;
2692  $current_state = $name;
2693  save_recent_url ($url, $base);
2694  $current_state = $o;
2695  $image_succeeded = 1;
2696  clearlog();
2697}
2698
2699
2700sub record_failure($) {
2701  my ($name) = @_;
2702
2703  return if $image_succeeded;
2704
2705  stop_timer ($name, 0);
2706  if ($verbose_load && !$verbose_exec) {
2707
2708    if ($suppress_audit) {
2709      print STDERR "$progname: " . timestr() . "(audit log suppressed)\n";
2710      return;
2711    }
2712
2713    my $o = $current_state;
2714    $current_state = "DEBUG";
2715
2716    my $line =  "#" x 78;
2717    print STDERR "\n\n\n";
2718    print STDERR ("#" x 78) . "\n";
2719    print STDERR blurb() . "failed to get an image.  Full audit log:\n";
2720    print STDERR "\n";
2721    showlog();
2722    print STDERR ("-" x 78) . "\n";
2723    print STDERR "\n\n";
2724
2725    $current_state = $o;
2726  }
2727  $image_succeeded = 0;
2728}
2729
2730
2731
2732sub stats_of($) {
2733  my ($name) = @_;
2734  my $i = $stats_successes{$name};
2735  my $j = $stats_attempts{$name};
2736  $i = 0 unless $i;
2737  $j = 0 unless $j;
2738  return "" . ($j ? int($i * 100 / $j) : "0") . "%";
2739}
2740
2741
2742my $current_start_time = 0;
2743
2744sub start_timer($) {
2745  my ($name) = @_;
2746  $current_start_time = time;
2747
2748  if (defined($stats_attempts{$name})) {
2749    $stats_attempts{$name}++;
2750  } else {
2751    $stats_attempts{$name} = 1;
2752  }
2753  if (!defined($stats_elapsed{$name})) {
2754    $stats_elapsed{$name} = 0;
2755  }
2756}
2757
2758sub stop_timer($$) {
2759  my ($name, $success) = @_;
2760  $stats_elapsed{$name} += time - $current_start_time;
2761}
2762
2763
2764my $last_report_time = 0;
2765sub report_performance() {
2766
2767  return unless $verbose_warnings;
2768
2769  my $now = time;
2770  return unless ($now >= $last_report_time + $report_performance_interval);
2771  my $ot = $last_report_time;
2772  $last_report_time = $now;
2773
2774  return if ($ot == 0);
2775
2776  my $blurb = "$progname: " . timestr();
2777
2778  print STDERR "\n";
2779  print STDERR "${blurb}Current standings:\n";
2780
2781  foreach my $name (sort keys (%stats_attempts)) {
2782    my $try = $stats_attempts{$name};
2783    my $suc = $stats_successes{$name} || 0;
2784    my $pct = int($suc * 100 / $try);
2785    my $secs = $stats_elapsed{$name};
2786    my $secs_link = $secs / $try;
2787    print STDERR sprintf ("$blurb %-14s %4s (%d/%d);" .
2788                          "       \t %.1f secs/link\n",
2789                          "$name:", "$pct%", $suc, $try, $secs_link);
2790  }
2791}
2792
2793
2794
2795my $max_recent_images = 400;
2796my $max_recent_sites  = 20;
2797my @recent_images = ();
2798my @recent_sites = ();
2799
2800sub save_recent_url($$) {
2801  my ($url, $base) = @_;
2802
2803  return unless ($verbose_warnings);
2804
2805  $_ = $url;
2806  my ($site) = m@^https?://([^ \t\n\r/:]+)@;
2807  return unless defined ($site);
2808
2809  if ($base eq $driftnet_magic || $base eq $local_magic) {
2810    $site = $base;
2811    @recent_images = ();
2812  }
2813
2814  my $done = 0;
2815  foreach (@recent_images) {
2816    if ($_ eq $url) {
2817      print STDERR blurb() . "WARNING: recently-duplicated image: $url" .
2818        " (on $base via $last_search)\n";
2819      $done = 1;
2820      last;
2821    }
2822  }
2823
2824  # suppress "duplicate site" warning via %warningless_sites.
2825  #
2826  if ($warningless_sites{$site}) {
2827    $done = 1;
2828  } elsif ($site =~ m@([^.]+\.[^.]+\.[^.]+)$@ &&
2829           $warningless_sites{$1}) {
2830    $done = 1;
2831  } elsif ($site =~ m@([^.]+\.[^.]+)$@ &&
2832           $warningless_sites{$1}) {
2833    $done = 1;
2834  }
2835
2836  if (!$done) {
2837    foreach (@recent_sites) {
2838      if ($_ eq $site) {
2839        print STDERR blurb() . "WARNING: recently-duplicated site: $site" .
2840        " ($url on $base via $last_search)\n";
2841        last;
2842      }
2843    }
2844  }
2845
2846  push @recent_images, $url;
2847  push @recent_sites,  $site;
2848  shift @recent_images if ($#recent_images >= $max_recent_images);
2849  shift @recent_sites  if ($#recent_sites  >= $max_recent_sites);
2850}
2851
2852
2853
2854##############################################################################
2855#
2856# other utilities
2857#
2858##############################################################################
2859
2860# Does %-decoding.
2861#
2862sub url_decode($) {
2863  ($_) = @_;
2864  tr/+/ /;
2865  s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
2866  return $_;
2867}
2868
2869
2870# Given the raw body of a GIF document, returns the dimensions of the image.
2871#
2872sub gif_size($) {
2873  my ($body) = @_;
2874  my $type = substr($body, 0, 6);
2875  my $s;
2876  return () unless ($type =~ /GIF8[7,9]a/);
2877  $s = substr ($body, 6, 10);
2878  my ($a,$b,$c,$d) = unpack ("C"x4, $s);
2879  return () unless defined ($d);
2880  return (($b<<8|$a), ($d<<8|$c));
2881}
2882
2883# Given the raw body of a JPEG document, returns the dimensions of the image.
2884#
2885sub jpeg_size($) {
2886  my ($body) = @_;
2887  my $i = 0;
2888  my $L = length($body);
2889
2890  my $c1 = substr($body, $i, 1); $i++;
2891  my $c2 = substr($body, $i, 1); $i++;
2892  return () unless (ord($c1) == 0xFF && ord($c2) == 0xD8);
2893
2894  my $ch = "0";
2895  while (ord($ch) != 0xDA && $i < $L) {
2896    # Find next marker, beginning with 0xFF.
2897    while (ord($ch) != 0xFF) {
2898      return () if (length($body) <= $i);
2899      $ch = substr($body, $i, 1); $i++;
2900    }
2901    # markers can be padded with any number of 0xFF.
2902    while (ord($ch) == 0xFF) {
2903      return () if (length($body) <= $i);
2904      $ch = substr($body, $i, 1); $i++;
2905    }
2906
2907    # $ch contains the value of the marker.
2908    my $marker = ord($ch);
2909
2910    if (($marker >= 0xC0) &&
2911        ($marker <= 0xCF) &&
2912        ($marker != 0xC4) &&
2913        ($marker != 0xCC)) {  # it's a SOFn marker
2914      $i += 3;
2915      return () if (length($body) <= $i);
2916      my $s = substr($body, $i, 4); $i += 4;
2917      my ($a,$b,$c,$d) = unpack("C"x4, $s);
2918      return (($c<<8|$d), ($a<<8|$b));
2919
2920    } else {
2921      # We must skip variables, since FFs in variable names aren't
2922      # valid JPEG markers.
2923      return () if (length($body) <= $i);
2924      my $s = substr($body, $i, 2); $i += 2;
2925      my ($c1, $c2) = unpack ("C"x2, $s);
2926      my $length = ($c1 << 8) | $c2;
2927      return () if ($length < 2);
2928      $i += $length-2;
2929    }
2930  }
2931  return ();
2932}
2933
2934# Given the raw body of a PNG document, returns the dimensions of the image.
2935#
2936sub png_size($) {
2937  my ($body) = @_;
2938  return () unless ($body =~ m/^\211PNG\r/);
2939  my ($bits) = ($body =~ m/^.{12}(.{12})/s);
2940  return () unless defined ($bits);
2941  return () unless ($bits =~ /^IHDR/);
2942  my ($ign, $w, $h) = unpack("a4N2", $bits);
2943  return ($w, $h);
2944}
2945
2946
2947# Given the raw body of a PNM document, returns the dimensions of the image.
2948#
2949sub pnm_size($) {
2950  my ($body) = @_;
2951  return () unless ($body =~ m/^P[1-6]\r?\n(\d+) +(\d+)\r?\n/s);
2952  return ($1, $2);
2953}
2954
2955
2956# Given the raw body of a GIF, JPEG, or PNG document, returns the dimensions
2957# of the image.
2958#
2959sub image_size($) {
2960  my ($body) = @_;
2961  my ($w, $h) = gif_size ($body);
2962  if ($w && $h) { return ($w, $h); }
2963  ($w, $h) = jpeg_size ($body);
2964  if ($w && $h) { return ($w, $h); }
2965  return png_size ($body);
2966  if ($w && $h) { return ($w, $h); }
2967  return pnm_size ($body);
2968}
2969
2970
2971# returns the full path of the named program, or undef.
2972#
2973sub which($) {
2974  my ($prog) = @_;
2975  foreach (split (/:/, $ENV{PATH})) {
2976    my $path = "$_/$prog";
2977    if (-x $path) {
2978      return $path;
2979    }
2980  }
2981  return undef;
2982}
2983
2984
2985# Like rand(), but chooses numbers with a bell curve distribution.
2986sub bellrand(;$) {
2987  ($_) = @_;
2988  $_ = 1.0 unless defined($_);
2989  $_ /= 3.0;
2990  return (rand($_) + rand($_) + rand($_));
2991}
2992
2993
2994sub exit_cleanup() {
2995  x_cleanup();
2996  print STDERR "$progname: exiting\n" if ($verbose_warnings);
2997  if (@pids_to_kill) {
2998    print STDERR blurb() . "killing: " . join(' ', @pids_to_kill) . "\n";
2999    kill ('TERM', @pids_to_kill);
3000  }
3001}
3002
3003sub signal_cleanup($) {
3004  my ($sig) = @_;
3005  print STDERR blurb() . (defined($sig)
3006                          ? "caught signal $sig."
3007                          : "exiting.")
3008                       . "\n"
3009    if ($verbose_exec || $verbose_warnings);
3010  exit 1;
3011}
3012
3013
3014
3015##############################################################################
3016#
3017# Generating a list of urls only
3018#
3019##############################################################################
3020
3021sub url_only_output() {
3022  do {
3023    my ($base, $img) = pick_image;
3024    if ($img) {
3025      $base =~ s/ /%20/g;
3026      $img  =~ s/ /%20/g;
3027      print "$img $base\n";
3028    }
3029  } while (1);
3030}
3031
3032##############################################################################
3033#
3034# Running as an xscreensaver module, or as a web page imagemap
3035#
3036##############################################################################
3037
3038my ($image_png, $image_tmp1, $image_tmp2);
3039{
3040  my $seed = rand(0xFFFFFFFF);
3041  $image_png = sprintf ("%s/webcollage-%08x",
3042                        ($ENV{TMPDIR} ? $ENV{TMPDIR} : "/tmp"),
3043                        $seed);
3044  $image_tmp1 = $image_png . '-1.png';
3045  $image_tmp2 = $image_png . '-2.png';
3046  $image_png .= '.png';
3047}
3048
3049
3050my $filter_cmd = undef;
3051my $post_filter_cmd = undef;
3052my $background = undef;
3053
3054my @imagemap_areas = ();
3055my $imagemap_html_tmp = undef;
3056my $imagemap_jpg_tmp = undef;
3057
3058
3059my $img_width;            # size of the image being generated.
3060my $img_height;
3061
3062my $delay = 2;
3063
3064sub x_cleanup() {
3065  unlink $image_png, $image_tmp1, $image_tmp2;
3066  unlink $imagemap_html_tmp, $imagemap_jpg_tmp
3067    if (defined ($imagemap_html_tmp));
3068}
3069
3070
3071# Like system, but prints status about exit codes, and kills this process
3072# with whatever signal killed the sub-process, if any.
3073#
3074sub nontrapping_system(@) {
3075  $! = 0;
3076
3077  $_ = join(" ", @_);
3078  s/\"[^\"]+\"/\"...\"/g;
3079
3080  LOG ($verbose_exec, "executing \"$_\"");
3081
3082  my $rc = system @_;
3083
3084  if ($rc == 0) {
3085    LOG ($verbose_exec, "subproc exited normally.");
3086  } elsif (($rc & 0xff) == 0) {
3087    $rc >>= 8;
3088    LOG ($verbose_exec, "subproc exited with status $rc.");
3089  } else {
3090    if ($rc & 0x80) {
3091      LOG ($verbose_exec, "subproc dumped core.");
3092      $rc &= ~0x80;
3093    }
3094    LOG ($verbose_exec, "subproc died with signal $rc.");
3095    # die that way ourselves.
3096    kill $rc, $$;
3097  }
3098
3099  return $rc;
3100}
3101
3102
3103# Creates a solid-colored PNG.
3104#
3105sub pngmake($$$$) {
3106  my ($outfile, $bgcolor, $w, $h) = @_;
3107
3108  my @cmd;
3109  if ($webcollage_helper) {
3110    @cmd = ($webcollage_helper, $bgcolor, $w, $h, $outfile);
3111  } else {
3112    @cmd = ($convert_cmd, '-size', "${w}x${h}", "xc:$bgcolor", $outfile);
3113  }
3114
3115  my $rc = nontrapping_system (@cmd);
3116  if ($rc != 0) {
3117    LOG(0, "failed to create $bgcolor image: \"$outfile\"");
3118    exit(1);
3119  }
3120}
3121
3122
3123sub pick_root_displayer() {
3124  my @names = ();
3125
3126  if ($cocoa_p) {
3127    # see "xscreensaver/hacks/webcollage-cocoa.m"
3128    return "echo COCOA LOAD ";
3129  }
3130
3131  foreach my $cmd (@root_displayers) {
3132    $_ = $cmd;
3133    my ($name) = m/^([^ ]+)/;
3134    push @names, "\"$name\"";
3135    LOG ($verbose_exec, "looking for $name...");
3136    foreach my $dir (split (/:/, $ENV{PATH})) {
3137      LOG ($verbose_exec, "  checking $dir/$name");
3138      return $cmd if (-x "$dir/$name");
3139    }
3140  }
3141
3142  $names[$#names] = "or " . $names[$#names];
3143  error "none of: " . join (", ", @names) . " were found on \$PATH.";
3144}
3145
3146
3147my $png_to_root_window_cmd = undef;
3148
3149
3150sub x_or_image_output($) {
3151  my ($window_id) = @_;
3152
3153  # Adjust the PATH for OS X 10.10.
3154  #
3155  $_ = $0;
3156  s:/[^/]*$::;
3157  s/([^a-zA-Z0-9._\-+\/])/\\$1/g;
3158  $ENV{PATH} = "$_:$ENV{PATH}";
3159
3160  # Check for our helper program, to see whether we need to use imagemagick.
3161  #
3162  $_ = "webcollage-helper";
3163
3164  if (! defined ($webcollage_helper)) {
3165    $webcollage_helper = which ($_);
3166  }
3167
3168  if (defined ($webcollage_helper)) {
3169    LOG ($verbose_decode, "found \"$webcollage_helper\"");
3170  } else {
3171    LOG (($verbose_decode || $verbose_load), "no $_ program");
3172  }
3173
3174  if ($cocoa_p && !defined ($webcollage_helper)) {
3175    error ("webcollage-helper not found in Cocoa-mode!");
3176  }
3177
3178  if (!$cocoa_p && defined ($webcollage_helper)) {
3179    foreach ($image_png, $image_tmp1, $image_tmp2) {
3180      s/\.png$/.jpg/si;
3181    }
3182  }
3183
3184  # make sure the various programs we execute exist, right up front.
3185  #
3186  my @progs = ();
3187
3188  push @progs, $convert_cmd unless defined($webcollage_helper);
3189
3190  foreach (@progs) {
3191    which ($_) || error "$_ not found on \$PATH.";
3192  }
3193
3194  # find a root-window displayer program.
3195  #
3196  if (!$no_output_p) {
3197    $png_to_root_window_cmd = pick_root_displayer();
3198  }
3199
3200  if (defined ($window_id)) {
3201    error ("-window-id only works if xscreensaver-getimage is installed")
3202      unless ($png_to_root_window_cmd =~ m/^xscreensaver-getimage\b/);
3203
3204    error ("unparsable window id: $window_id")
3205      unless ($window_id =~ m/^\d+$|^0x[\da-f]+$/i);
3206    $png_to_root_window_cmd =~ s/--?root\b/$window_id/ ||
3207      error ("unable to munge displayer: $png_to_root_window_cmd");
3208  }
3209
3210  if (!$img_width || !$img_height) {
3211
3212    if (!defined ($window_id) &&
3213        defined ($ENV{XSCREENSAVER_WINDOW})) {
3214      $window_id = $ENV{XSCREENSAVER_WINDOW};
3215    }
3216
3217    if (!defined ($window_id)) {
3218      $_ = "xdpyinfo";
3219      which ($_) || error "$_ not found on \$PATH.";
3220      $_ = `$_`;
3221      ($img_width, $img_height) = m/dimensions: *(\d+)x(\d+) /;
3222      if (!defined($img_height)) {
3223        error "xdpyinfo failed.";
3224      }
3225    } else {  # we have a window id
3226      $_ = "xwininfo";
3227      which ($_) || error "$_ not found on \$PATH.";
3228      $_ .= " -id $window_id";
3229      $_ = `$_`;
3230      ($img_width, $img_height) = m/^\s*Width:\s*(\d+)\n\s*Height:\s*(\d+)\n/m;
3231
3232      if (!defined($img_height)) {
3233        error "xwininfo failed.";
3234      }
3235    }
3236  }
3237
3238  my $bgcolor = "#000000";
3239  my $bgimage = undef;
3240
3241  if ($background) {
3242    if ($background =~ m/^\#[0-9a-f]+$/i) {
3243      $bgcolor = $background;
3244
3245    } elsif (-r $background) {
3246      $bgimage = $background;
3247
3248    } elsif (! $background =~ m@^[-a-z0-9 ]+$@i) {
3249      error "not a color or readable file: $background";
3250
3251    } else {
3252      # default to assuming it's a color
3253      $bgcolor = $background;
3254    }
3255  }
3256
3257  # Create the sold-colored base image.
3258  #
3259  LOG ($verbose_decode, "creating base image: ${img_width}x${img_height}");
3260  $_ = pngmake ($image_png, $bgcolor, $img_width, $img_height);
3261
3262  # Paste the default background image in the middle of it.
3263  #
3264  if ($bgimage) {
3265    open (my $in, '<:raw', $bgimage) || error ("$bgimage: $!");
3266    local $/ = undef;  # read entire file
3267    my $body = <$in>;
3268    close ($in) || error ("$bgimage: $!");
3269    paste_image ('init', $image_png, $body, 'init', 1);
3270  }
3271
3272  clearlog();
3273
3274  while (1) {
3275    my ($base, $img) = pick_image();
3276    my $source = $current_state;
3277    $current_state = "loadimage";
3278    if ($img) {
3279      my ($headers, $body) = get_document ($img, $base);
3280      if ($body) {
3281        paste_image ($base, $img, $body, $source);
3282        $body = undef;
3283      }
3284    }
3285    $current_state = "idle";
3286    $load_method = "none";
3287
3288    unlink $image_tmp1, $image_tmp2;
3289    sleep $delay;
3290  }
3291}
3292
3293sub paste_image($$$$;$) {
3294  my ($base, $img, $body, $source, $init_p) = @_;
3295
3296  $current_state = "paste";
3297
3298  $suppress_audit = 0;
3299
3300  LOG ($verbose_decode, "got $img (" . length($body) . ")");
3301
3302  my ($iw, $ih) = image_size ($body);
3303  if (!$iw || !$ih) {
3304    LOG (($verbose_decode || $verbose_load),
3305         "not a GIF, JPG, or PNG" .
3306         (($body =~ m@<(base|html|head|body|script|table|a href)>@i)
3307          ? " (looks like HTML)" : "") .
3308         ": $img");
3309    $suppress_audit = 1;
3310    $body = undef;
3311    return 0;
3312  }
3313
3314  if ($iw <= 0 || $ih <= 0 || $iw > 9999 || $ih > 9999) {
3315    LOG (($verbose_decode || $verbose_load),
3316         "ludicrous image dimensions: $iw x $ih (" . length($body) .
3317         "): $img");
3318    $body = undef;
3319    return 0;
3320  }
3321
3322  open (my $out, '>:raw', $image_tmp1) || error ("writing $image_tmp1: $!");
3323  (print $out $body) || error ("writing $image_tmp1: $!");
3324  close ($out) || error ("writing $image_tmp1: $!");
3325
3326  record_success ($load_method, $img, $base);
3327
3328  my $ow = $iw;  # used only for error messages
3329  my $oh = $ih;
3330
3331  # don't just tack this onto the front of the pipeline -- we want it to
3332  # be able to change the size of the input image.
3333  #
3334  if ($filter_cmd && !$init_p) {
3335    LOG ($verbose_decode, "running $filter_cmd");
3336
3337    # #### Historically, $filter_cmd read and write PPM files.
3338    #      This is doing PNG or JPEG now.  I'll bet nobody uses this.
3339
3340    my $rc = nontrapping_system "($filter_cmd) < $image_tmp1 >$image_tmp2";
3341    if ($rc != 0) {
3342      LOG(($verbose_decode || $verbose_load),
3343          "failed command: \"$filter_cmd\"");
3344      LOG(($verbose_decode || $verbose_load),
3345          "failed URL: \"$img\" (${ow}x$oh)");
3346      return;
3347    }
3348    rename ($image_tmp2, $image_tmp1);
3349
3350    # re-get the width/height in case the filter resized it.
3351    open (my $imgf, '<:raw', $image_tmp1) || return 0;
3352    my $b = '';
3353    sysread ($imgf, $b, 10240);
3354    close $imgf;
3355    ($iw, $ih) = image_size ($b);
3356    return 0 unless ($iw && $ih);
3357  }
3358
3359  my $target_w = $img_width;   # max rectangle into which the image must fit
3360  my $target_h = $img_height;
3361
3362  my $scale = 1.0;
3363
3364  my $crop_x = 0;     # the sub-rectangle of the image
3365  my $crop_y = 0;     # that we will actually paste.
3366  my $crop_w = $iw;
3367  my $crop_h = $ih;
3368  my $x = 0;
3369  my $y = 0;
3370
3371  if (!$init_p) {
3372
3373    # Usually scale the image to fit on the screen -- but sometimes scale it
3374    # to fit on half or a quarter of the screen.  (We do this by reducing the
3375    # size of the target rectangle.)  Note that the image is not merely scaled
3376    # to fit; we instead cut the image in half repeatedly until it fits in the
3377    # target rectangle -- that gives a wider distribution of sizes.
3378    #
3379    if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; } # reduce target rect
3380    if (rand() < 0.3) { $target_w /= 2; $target_h /= 2; }
3381
3382    if ($iw > $target_w || $ih > $target_h) {
3383      while ($iw > $target_w ||
3384             $ih > $target_h) {
3385        $iw = int($iw / 2);
3386        $ih = int($ih / 2);
3387        $scale /= 2;
3388      }
3389      if ($iw <= 10 || $ih <= 10) {
3390        LOG ($verbose_decode, "scaling ${ow}x${oh} to ${iw}x$ih" .
3391             " would have been bogus.");
3392        return 0;
3393      }
3394
3395      $crop_w = $iw;
3396      $crop_h = $ih;
3397
3398      LOG ($verbose_decode, "scaling ${ow}x${oh} to ${iw}x$ih ($scale)");
3399    }
3400
3401
3402    my $src = $image_tmp1;
3403
3404    # The chance that we will randomly crop out a section of an image starts
3405    # out fairly low, but goes up for images that are very large, or images
3406    # that have ratios that make them look like banners (we try to avoid
3407    # banner images entirely, but they slip through when the IMG tags didn't
3408    # have WIDTH and HEIGHT specified.)
3409    #
3410    my $crop_chance = 0.2;
3411    if ($iw > $img_width * 0.4 || $ih > $img_height * 0.4) {
3412      $crop_chance += 0.2;
3413    }
3414    if ($iw > $img_width * 0.7 || $ih > $img_height * 0.7) {
3415      $crop_chance += 0.2;
3416    }
3417    if ($min_ratio && ($iw * $min_ratio) > $ih) {
3418      $crop_chance += 0.7;
3419    }
3420
3421    if ($crop_chance > 0.1) {
3422      LOG ($verbose_decode, "crop chance: $crop_chance");
3423    }
3424
3425    if (rand() < $crop_chance) {
3426
3427      my $ow = $crop_w;
3428      my $oh = $crop_h;
3429
3430      if ($crop_w > $min_width) {
3431        # if it's a banner, select the width linearly.
3432        # otherwise, select a bell.
3433        my $r = (($min_ratio && ($iw * $min_ratio) > $ih)
3434                 ? rand()
3435                 : bellrand());
3436        $crop_w = $min_width + int ($r * ($crop_w - $min_width));
3437        $crop_x = int (rand() * ($ow - $crop_w));
3438      }
3439      if ($crop_h > $min_height) {
3440        # height always selects as a bell.
3441        $crop_h = $min_height + int (bellrand() * ($crop_h - $min_height));
3442        $crop_y = int (rand() * ($oh - $crop_h));
3443      }
3444
3445      # Clip it to the actual post-scaling image size.
3446      if ($crop_x + $crop_w > $iw) { $crop_w = $iw - $crop_x; }
3447      if ($crop_y + $crop_h > $ih) { $crop_h = $ih - $crop_y; }
3448      if ($crop_x < 0) { $crop_w += $crop_x; $crop_x = 0; }
3449      if ($crop_y < 0) { $crop_h += $crop_y; $crop_y = 0; }
3450
3451      if ($crop_x != 0   || $crop_y != 0 ||
3452          $crop_w != $iw || $crop_h != $ih) {
3453        LOG ($verbose_decode,
3454             "randomly cropping to ${crop_w}x$crop_h \@ $crop_x,$crop_y");
3455      }
3456    }
3457
3458    # Where the image should logically land -- this might be negative.
3459    #
3460    $x = int((rand() * ($img_width  + $crop_w/2)) - $crop_w*3/4);
3461    $y = int((rand() * ($img_height + $crop_h/2)) - $crop_h*3/4);
3462
3463    # if we have chosen to paste the image outside of the rectangle of the
3464    # screen, then we need to crop it.
3465    #
3466    if ($x < 0 ||
3467        $y < 0 ||
3468        $x + $crop_w > $img_width ||
3469        $y + $crop_h > $img_height) {
3470
3471      LOG ($verbose_decode,
3472           "cropping for effective paste of ${crop_w}x${crop_h} \@ $x,$y");
3473
3474      if ($x < 0) { $crop_x -= $x; $crop_w += $x; $x = 0; }
3475      if ($y < 0) { $crop_y -= $y; $crop_h += $y; $y = 0; }
3476
3477      if ($x + $crop_w >= $img_width)  { $crop_w = $img_width  - $x - 1; }
3478      if ($y + $crop_h >= $img_height) { $crop_h = $img_height - $y - 1; }
3479    }
3480
3481    # If any cropping needs to happen, add pnmcut.
3482    #
3483    if ($crop_x != 0   || $crop_y != 0 ||
3484        $crop_w != $iw || $crop_h != $ih) {
3485      $iw = $crop_w;
3486      $ih = $crop_h;
3487      LOG ($verbose_decode, "cropping to ${crop_w}x$crop_h \@ " .
3488           "$crop_x,$crop_y");
3489    }
3490
3491    LOG ($verbose_decode, "pasting ${iw}x$ih \@ $x,$y in $image_png");
3492  }
3493
3494  my @cmd;
3495  if (defined ($webcollage_helper)) {
3496    @cmd = ($webcollage_helper,
3497            $image_tmp1, $image_png,
3498            $scale, $opacity,
3499            $crop_x, $crop_y, $x, $y,
3500            $iw, $ih);
3501  } else {
3502    @cmd = ($convert_cmd,
3503            $image_png,
3504            '(',
3505            $image_tmp1 . '[0]',
3506            '-scale', sprintf("%.2f%%", 100 * $scale),
3507            '-crop', "${iw}x${ih}+${crop_x}+${crop_y}",
3508            '-geometry', "+${x}+${y}",
3509
3510            ($init_p ? () :
3511             (
3512              # Blurry edges with rounded corners
3513              '-alpha', 'set',
3514              '-virtual-pixel', 'transparent',
3515              '-channel', 'A',
3516              '-blur', '0x12',
3517              '-level', '50%,100%',
3518
3519              # Overall transparency
3520              '-evaluate', 'multiply', $opacity,
3521
3522              '+channel',
3523             )),
3524            ')',
3525            '-composite',
3526            '+repage',
3527            '-strip',
3528            $image_png);
3529  }
3530
3531  #### $verbose_decode should mean 2>/dev/null
3532
3533  my $rc = nontrapping_system (@cmd);
3534
3535  if (-z $image_png) {
3536    LOG (1, "failed command: \"@cmd\"");
3537    print STDERR "\naudit log:\n\n\n";
3538    print STDERR ("#" x 78) . "\n";
3539    print STDERR blurb() . "$image_png has zero size\n";
3540    showlog();
3541    print STDERR "\n\n";
3542    exit (1);
3543  }
3544
3545  if ($rc != 0) {
3546    LOG (($verbose_decode || $verbose_load), "failed command: \"@cmd\"");
3547    LOG (($verbose_decode || $verbose_load),
3548         "failed URL: \"$img\" (${ow}x$oh)");
3549    return;
3550  }
3551
3552  my $target = "$image_png";
3553
3554  # don't just tack this onto the end of the pipeline -- we don't want it
3555  # to end up in $image_png, because we don't want the results to be
3556  # cumulative.
3557  #
3558  if ($post_filter_cmd) {
3559
3560    # #### Historically, $post_filter_cmd read and write PPM files.
3561    #      This is doing PNG or JPEG now.  I'll bet nobody uses this.
3562
3563    $target = $image_tmp1;
3564    my $cmd = "($post_filter_cmd) < $image_png > $target";
3565    $rc = nontrapping_system ($cmd);
3566    if ($rc != 0) {
3567      LOG ($verbose_decode, "filter failed: \"$post_filter_cmd\"\n");
3568      return;
3569    }
3570  }
3571
3572  if (!$no_output_p) {
3573    my $tsize = (stat($target))[7];
3574    if ($tsize > 200) {
3575      my $cmd = "$png_to_root_window_cmd $target";
3576
3577      # xv seems to hate being killed.  it tends to forget to clean
3578      # up after itself, and leaves windows around and colors allocated.
3579      # I had this same problem with vidwhacker, and I'm not entirely
3580      # sure what I did to fix it.  But, let's try this: launch xv
3581      # in the background, so that killing this process doesn't kill it.
3582      # it will die of its own accord soon enough.  So this means we
3583      # start pumping bits to the root window in parallel with starting
3584      # the next network retrieval, which is probably a better thing
3585      # to do anyway.
3586      #
3587      $cmd .= " &" unless ($cocoa_p);
3588
3589      $rc = nontrapping_system ($cmd);
3590
3591      if ($rc != 0) {
3592        LOG (($verbose_decode || $verbose_load), "display failed: \"$cmd\"");
3593        return;
3594      }
3595
3596    } else {
3597      LOG ($verbose_decode, "$target size is $tsize");
3598    }
3599  }
3600
3601  if (defined($source)) {
3602    $source .= "-" . stats_of($source);
3603    print STDOUT "image: ${iw}x${ih} @ $x,$y $base $source\n"
3604      if ($verbose_imgmap);
3605    if ($imagemap_base) {
3606      update_imagemap ($base, $x, $y, $iw, $ih,
3607                       $image_png, $img_width, $img_height);
3608    }
3609  }
3610
3611  clearlog();
3612
3613  return 1;
3614}
3615
3616
3617sub update_imagemap($$$$$$$$) {
3618  my ($url, $x, $y, $w, $h, $image_png, $image_width, $image_height) = @_;
3619
3620  $current_state = "imagemap";
3621
3622  my $max_areas = 200;
3623
3624  $url = html_quote ($url);
3625  push @imagemap_areas, [$x, $y, $w, $h, $url];
3626  shift @imagemap_areas if (@imagemap_areas > $max_areas);
3627  LOG ($verbose_decode, "area: $x,$y,$w,$h");
3628
3629  my $map_name = $imagemap_base;
3630  $map_name =~ s@^.*/@@;
3631  $map_name = 'collage' if ($map_name eq '');
3632
3633  my $imagemap_html = $imagemap_base . ".html";
3634  my $imagemap_jpg  = $imagemap_base . ".jpg";
3635  my $imagemap_jpg2 = $imagemap_jpg;
3636  $imagemap_jpg2 =~ s@^.*/@@gs;
3637
3638  if (!defined ($imagemap_html_tmp)) {
3639    $imagemap_html_tmp = $imagemap_html . sprintf (".%08x", rand(0xffffffff));
3640    $imagemap_jpg_tmp  = $imagemap_jpg  . sprintf (".%08x", rand(0xffffffff));
3641  }
3642
3643  # Read the imagemap html file (if any) to get a template.
3644  #
3645  my $template_html = '';
3646  {
3647    if (open (my $in, '<', $imagemap_html)) {
3648      local $/ = undef;  # read entire file
3649      $template_html = <$in>;
3650      close $in;
3651      LOG ($verbose_decode, "read template $imagemap_html");
3652    }
3653
3654    if (! ($template_html =~ m/\.webcollage_box\b/s)) {  # missing or old
3655      $template_html =
3656       '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
3657	  "http://www.w3.org/TR/html4/loose.dtd">
3658<HTML>
3659 <HEAD>
3660  <BASE TARGET="_new">
3661  <meta HTTP-EQUIV="Refresh" content="60" />
3662  <TITLE>WebCollage</TITLE>
3663  <STYLE TYPE="text/css">
3664   <!--
3665    body { color: #FFF; max-width: 100%; }
3666    .webcollage_date, .webcollage_size {
3667      display: block; margin-top: 4px; font-size: 7pt; color: #888;
3668    }
3669    .webcollage_date { float: left; }
3670    .webcollage_size { float: right; }
3671
3672    .webcollage_frame {
3673      overflow: hidden;
3674      position: relative;
3675      padding-bottom: 56.25%;
3676      padding-bottom: 75%; /* 4:3 aspect ratio */
3677      border: 1px solid #888;
3678      background: #000;
3679    }
3680
3681    .webcollage_box {
3682       position: absolute; top: 0; left: 0;
3683       border: 0; margin: 0; padding: 0;
3684       width:  100%;
3685       height: 100%;
3686    }
3687
3688    .webcollage_box > img { width: 100%; height: 100%; border: 0; }
3689
3690    .webcollage_frame > a {
3691      display: block;
3692      position: absolute;
3693      border-radius: 16px;
3694    }
3695
3696    .webcollage_frame > a:hover {
3697      background: rgba(1,1,1,.25);
3698    }
3699   -->
3700  </STYLE>
3701 </HEAD>
3702 <BODY>
3703  <DIV CLASS="webcollage_frame">
3704   <DIV CLASS="webcollage_box"></DIV>
3705  </DIV>
3706 </BODY>
3707</HTML>
3708';
3709
3710      LOG ($verbose_decode, "created dummy template");
3711    }
3712  }
3713
3714  # Write the jpg to a tmp file
3715  #
3716  {
3717    my @cmd;
3718    if (defined($webcollage_helper)) {
3719      @cmd = ('cp', '-p', $image_png, $imagemap_jpg_tmp);
3720    } else {
3721      @cmd = ($convert_cmd, $image_png, 'jpeg:' . $imagemap_jpg_tmp);
3722    }
3723    my $rc = nontrapping_system (@cmd);
3724    if ($rc != 0) {
3725      error ("imagemap jpeg failed: \"@cmd\"\n");
3726    }
3727  }
3728
3729  # Write the html to a tmp file
3730  #
3731  {
3732    my $body = $template_html;
3733    my $img = ("   <DIV CLASS=\"webcollage_box\">" .
3734                    "<IMG SRC=\"$imagemap_jpg2\">" .
3735                   "</DIV>\n");
3736    foreach my $a (@imagemap_areas) {
3737      my ($x, $y, $w, $h, $u) = @$a;
3738      $x /= $img_width  / 100;
3739      $y /= $img_height / 100;
3740      $w /= $img_width  / 100;
3741      $h /= $img_height / 100;
3742      foreach ($x, $y, $w, $h) { $_ = sprintf("%.1f%%", $_); }
3743      $u = html_quote($u);
3744      $img .= ("   <A HREF=\"$u\" STYLE=\"" .
3745               "left:$x;top:$y;width:$w;height:$h\"/>\n");
3746    }
3747
3748    $img = ("<DIV CLASS=\"webcollage_frame\">\n" .
3749            $img .
3750            "  </DIV>\n");
3751    $body =~ s@<DIV \s+ CLASS=[\"\']webcollage_frame[\"\']>
3752                .*? </DIV> .*? </DIV>@$img@sex ||
3753      error ("$imagemap_html_tmp: unable to splice image");
3754
3755
3756    # if there are magic webcollage spans in the html, update those too.
3757    #
3758    {
3759      my @st = stat ($imagemap_jpg_tmp);
3760      my $date = strftime("%d-%b-%Y %l:%M:%S %p %Z", localtime($st[9]));
3761      my $size = int(($st[7] / 1024) + 0.5) . "K";
3762      $body =~ s@(<SPAN\s+CLASS=\"webcollage_date\">).*?(</SPAN>)@$1$date$2@si;
3763      $body =~ s@(<SPAN\s+CLASS=\"webcollage_size\">).*?(</SPAN>)@$1$size$2@si;
3764    }
3765
3766    open (my $out, '>', $imagemap_html_tmp) || error("$imagemap_html_tmp: $!");
3767    (print $out $body)                      || error("$imagemap_html_tmp: $!");
3768    close ($out)                            || error("$imagemap_html_tmp: $!");
3769    LOG ($verbose_decode, "wrote $imagemap_html_tmp");
3770  }
3771
3772  # Rename the two tmp files to the real files
3773  #
3774  rename ($imagemap_html_tmp, $imagemap_html) ||
3775    error "renaming $imagemap_html_tmp to $imagemap_html";
3776  LOG ($verbose_decode, "wrote $imagemap_html");
3777
3778  if ($imagemap_jpg_tmp ne $image_png) {
3779    rename ($imagemap_jpg_tmp,  $imagemap_jpg) ||
3780      error "renaming $imagemap_jpg_tmp to $imagemap_jpg";
3781    LOG ($verbose_decode, "wrote $imagemap_jpg");
3782  }
3783}
3784
3785
3786# Figure out what the proxy server should be, either from environment
3787# variables or by parsing the output of the (MacOS) program "scutil",
3788# which tells us what the system-wide proxy settings are.
3789#
3790sub set_proxy() {
3791
3792  if (! defined($http_proxy)) {
3793    # historical suckage: the environment variable name is lower case.
3794    $http_proxy = $ENV{http_proxy} || $ENV{HTTP_PROXY};
3795  }
3796
3797  if (defined ($http_proxy)) {
3798    if ($http_proxy && $http_proxy =~ m@^https?://([^/]*)/?$@ ) {
3799      # historical suckage: allow "http://host:port" as well as "host:port".
3800      $http_proxy = $1;
3801    }
3802
3803  } else {
3804    my $proxy_data = `scutil --proxy 2>/dev/null`;
3805    my ($server) = ($proxy_data =~ m/\bHTTPProxy\s*:\s*([^\s]+)/s);
3806    my ($port)   = ($proxy_data =~ m/\bHTTPPort\s*:\s*([^\s]+)/s);
3807    # Note: this ignores the "ExceptionsList".
3808    if ($server) {
3809      $http_proxy = $server;
3810      $http_proxy .= ":$port" if $port;
3811    }
3812  }
3813
3814  delete $ENV{http_proxy};
3815  delete $ENV{HTTP_PROXY};
3816  delete $ENV{https_proxy};
3817  delete $ENV{HTTPS_PROXY};
3818  delete $ENV{PERL_LWP_ENV_PROXY};
3819
3820  if ($http_proxy) {
3821    $http_proxy = 'http://' . $http_proxy;
3822    LOG ($verbose_net, "proxy server: $http_proxy");
3823  } else {
3824    $http_proxy = undef;  # for --proxy ''
3825  }
3826}
3827
3828
3829sub init_signals() {
3830
3831  $SIG{HUP}  = \&signal_cleanup;
3832  $SIG{INT}  = \&signal_cleanup;
3833  $SIG{QUIT} = \&signal_cleanup;
3834  $SIG{ABRT} = \&signal_cleanup;
3835  $SIG{KILL} = \&signal_cleanup;
3836  $SIG{TERM} = \&signal_cleanup;
3837
3838  # Need this so that if subprocess filters die, we don't die.
3839  $SIG{PIPE} = 'IGNORE';
3840}
3841
3842END { exit_cleanup(); }
3843
3844
3845sub main() {
3846  $| = 1;
3847  srand(time ^ $$);
3848
3849  my $verbose = 0;
3850  my $dict;
3851  my $driftnet_cmd = 0;
3852
3853  $current_state = "init";
3854  $load_method = "none";
3855
3856  my $root_p = 0;
3857  my $window_id = undef;
3858
3859  while ($#ARGV >= 0) {
3860    $_ = shift @ARGV;
3861    if (m/^--?d(i(s(p(l(a(y)?)?)?)?)?)?$/s) {
3862      $ENV{DISPLAY} = shift @ARGV;
3863    } elsif (m/^--?root$/s) {
3864      $root_p = 1;
3865    } elsif (m/^--?window-id$/s) {
3866      $window_id = shift @ARGV;
3867      $root_p = 1;
3868    } elsif (m/^--?no-output$/s) {
3869      $no_output_p = 1;
3870    } elsif (m/^--?urls(-only)?$/s) {
3871      $urls_only_p = 1;
3872      $no_output_p = 1;
3873    } elsif (m/^--?cocoa$/s) {
3874      $cocoa_p = 1;
3875    } elsif (m/^--?imagemap$/s) {
3876      $imagemap_base = shift @ARGV;
3877      $no_output_p = 1;
3878    } elsif (m/^--?verbose$/s) {
3879      $verbose++;
3880    } elsif (m/^-v+$/) {
3881      $verbose += length($_)-1;
3882    } elsif (m/^--?delay$/s) {
3883      $delay = shift @ARGV;
3884    } elsif (m/^--?timeout$/s) {
3885      $http_timeout = shift @ARGV;
3886    } elsif (m/^--?filter$/s) {
3887      $filter_cmd = shift @ARGV;
3888    } elsif (m/^--?filter2$/s) {
3889      $post_filter_cmd = shift @ARGV;
3890    } elsif (m/^--?(background|bg)$/s) {
3891      $background = shift @ARGV;
3892    } elsif (m/^--?size$/s) {
3893      $_ = shift @ARGV;
3894      if (m@^(\d+)x(\d+)$@) {
3895        $img_width = $1;
3896        $img_height = $2;
3897      } else {
3898        error "argument to \"--size\" must be of the form \"640x400\"";
3899      }
3900    } elsif (m/^--?(http-)?proxy$/s) {
3901      $http_proxy = shift @ARGV;
3902    } elsif (m/^--?dict(ionary)?$/s) {
3903      $dict = shift @ARGV;
3904    } elsif (m/^--?opacity$/s) {
3905      $opacity = shift @ARGV;
3906      error ("opacity must be between 0.0 and 1.0")
3907        if ($opacity <= 0 || $opacity > 1);
3908    } elsif (m/^--?driftnet$/s) {
3909      @search_methods = ( 100, "driftnet", \&pick_from_driftnet );
3910      if (! ($ARGV[0] =~ m/^-/)) {
3911        $driftnet_cmd = shift @ARGV;
3912      } else {
3913        $driftnet_cmd = $default_driftnet_cmd;
3914      }
3915    } elsif (m/^--?dir(ectory)?$/s) {
3916      @search_methods = ( 100, "local", \&pick_from_local_dir );
3917      if (! ($ARGV[0] =~ m/^-/)) {
3918        $local_dir = shift @ARGV;
3919      } else {
3920        error ("local directory path must be set")
3921      }
3922    } elsif (m/^--?fps$/s) {
3923      # -fps only works on MacOS, via "webcollage-cocoa.m".
3924      # Ignore it if passed to this script in an X11 context.
3925    } elsif (m/^--?debug$/s) {
3926      my $which = shift @ARGV;
3927      my @rest = @search_methods;
3928      my $ok = 0;
3929      while (@rest) {
3930        my $pct  = shift @rest;
3931        my $name = shift @rest;
3932        my $tfn  = shift @rest;
3933
3934        if ($name eq $which) {
3935          @search_methods = (100, $name, $tfn);
3936          $ok = 1;
3937          last;
3938        }
3939      }
3940      error "no such search method as \"$which\"" unless ($ok);
3941      LOG (1, "DEBUG: using only \"$which\"");
3942      $report_performance_interval = 30;
3943
3944    } else {
3945      print STDERR "unknown option: $_\n\n";
3946      print STDERR "$copyright\nusage: $progname " .
3947              "[--root] [--display dpy] [--verbose] [--debug which]\n" .
3948        "\t\t  [--timeout secs] [--delay secs] [--size WxH]\n" .
3949        "\t\t  [--no-output] [--urls-only] [--imagemap filename]\n" .
3950        "\t\t  [--background color] [--opacity f]\n" .
3951        "\t\t  [--filter cmd] [--filter2 cmd]\n" .
3952        "\t\t  [--dictionary dictionary-file] [--http-proxy host[:port]]\n" .
3953        "\t\t  [--driftnet [driftnet-program-and-args]]\n" .
3954        "\t\t  [--directory local-image-directory]\n" .
3955        "\n";
3956      exit 1;
3957    }
3958  }
3959
3960  if (!$root_p && !$no_output_p && !$cocoa_p) {
3961    print STDERR $copyright;
3962    error "the --root argument is mandatory (for now.)";
3963  }
3964
3965  if (!$no_output_p && !$cocoa_p && !$ENV{DISPLAY}) {
3966    error "\$DISPLAY is not set.";
3967  }
3968
3969
3970  if ($verbose == 1) {
3971    $verbose_imgmap   = 1;
3972    $verbose_warnings = 1;
3973
3974  } elsif ($verbose == 2) {
3975    $verbose_imgmap   = 1;
3976    $verbose_warnings = 1;
3977    $verbose_load     = 1;
3978
3979  } elsif ($verbose == 3) {
3980    $verbose_imgmap   = 1;
3981    $verbose_warnings = 1;
3982    $verbose_load     = 1;
3983    $verbose_filter   = 1;
3984
3985  } elsif ($verbose == 4) {
3986    $verbose_imgmap   = 1;
3987    $verbose_warnings = 1;
3988    $verbose_load     = 1;
3989    $verbose_filter   = 1;
3990    $verbose_net      = 1;
3991
3992  } elsif ($verbose == 5) {
3993    $verbose_imgmap   = 1;
3994    $verbose_warnings = 1;
3995    $verbose_load     = 1;
3996    $verbose_filter   = 1;
3997    $verbose_net      = 1;
3998    $verbose_decode   = 1;
3999
4000  } elsif ($verbose == 6) {
4001    $verbose_imgmap   = 1;
4002    $verbose_warnings = 1;
4003    $verbose_load     = 1;
4004    $verbose_filter   = 1;
4005    $verbose_net      = 1;
4006    $verbose_decode   = 1;
4007    $verbose_http     = 1;
4008
4009  } elsif ($verbose >= 7) {
4010    $verbose_imgmap   = 1;
4011    $verbose_warnings = 1;
4012    $verbose_load     = 1;
4013    $verbose_filter   = 1;
4014    $verbose_net      = 1;
4015    $verbose_decode   = 1;
4016    $verbose_http     = 1;
4017    $verbose_exec     = 1;
4018  }
4019
4020  if ($dict) {
4021    error ("$dict does not exist") unless (-f $dict);
4022    $wordlist = $dict;
4023  } else {
4024    pick_dictionary();
4025  }
4026
4027  if ($imagemap_base && !($img_width && $img_height)) {
4028    error ("--size WxH is required with --imagemap");
4029  }
4030
4031  if (defined ($local_dir)) {
4032    $_ = "xscreensaver-getimage-file";
4033    which ($_) || error "$_ not found on \$PATH.";
4034  }
4035
4036  init_signals();
4037  set_proxy();
4038
4039  spawn_driftnet ($driftnet_cmd) if ($driftnet_cmd);
4040
4041  if ($urls_only_p) {
4042    url_only_output ();
4043  } else {
4044    x_or_image_output ($window_id);
4045  }
4046}
4047
4048main();
4049exit (0);
4050