# Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
#
# * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
#   its contributors may be used to endorse or promote products derived
#   from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)

"""Download images from Flickr for a given search text (Python 2).

The script queries the flickr.photos.search API in one-day upload-date
windows, starting from the current time and moving backwards in time, and
downloads the largest available version of each photo into the given image
directory.
"""

import os
import time
import datetime
import urllib
import urllib2
import socket
import argparse
import multiprocessing
import xml.etree.ElementTree as ElementTree


# Maximum number of results per API page (Flickr caps this at 500).
PER_PAGE = 500
SORT = "date-posted-desc"
URL = "https://api.flickr.com/services/rest/?method=flickr.photos.search&" \
      "api_key=%s&text=%s&sort=%s&per_page=%d&page=%d&min_upload_date=%s&" \
      "max_upload_date=%s&format=rest&extras=url_o,url_l,url_c,url_z,url_n"
MAX_PAGE_REQUESTS = 5
MAX_PAGE_TIMEOUT = 20
MAX_IMAGE_REQUESTS = 3
# Width of the upload-date search window in seconds (one day).
TIME_SKIP = 24 * 60 * 60
MAX_DATE = time.time()
MIN_DATE = MAX_DATE - TIME_SKIP


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--search_text", required=True)
    parser.add_argument("--api_key", required=True)
    parser.add_argument("--image_path", required=True)
    parser.add_argument("--num_procs", type=int, default=10)
    parser.add_argument("--max_days_without_image", type=int, default=365)
    args = parser.parse_args()
    return args


def compose_url(page, api_key, text, min_date, max_date):
    return URL % (api_key, text, SORT, PER_PAGE, page,
                  str(min_date), str(max_date))


def parse_page(page, api_key, text, min_date, max_date):
    """Fetch one result page and return its metadata and photo attributes."""
    f = None
    for _ in range(MAX_PAGE_REQUESTS):
        try:
            f = urllib2.urlopen(compose_url(page, api_key, text, min_date,
                                            max_date),
                                timeout=MAX_PAGE_TIMEOUT)
        except socket.timeout:
            continue
        else:
            break

    if f is None:
        # All requests timed out; report the page as empty.
        return {"pages": "0",
                "total": "0",
                "page": "0",
                "perpage": "0"}, tuple()

    response = f.read()
    root = ElementTree.fromstring(response)

    if root.attrib["stat"] != "ok":
        raise IOError("Flickr API request failed")

    photos = []
    for photo in root.iter("photo"):
        photos.append(photo.attrib)

    return root.find("photos").attrib, photos


class PhotoDownloader(object):

    def __init__(self, image_path):
        self.image_path = image_path

    def __call__(self, photo):
        image_name = "%s_%s.jpg" % (photo["id"], photo["secret"])
        path = os.path.join(self.image_path, image_name)
        if not os.path.exists(path):
            # Prefer the largest available image size (original first).
            url = None
            for url_suffix in ("o", "l", "k", "h", "b", "c", "z"):
                url_attr = "url_%s" % url_suffix
                if photo.get(url_attr) is not None:
                    url = photo.get(url_attr)
                    break
            if url is not None:
                print url
                for _ in range(MAX_IMAGE_REQUESTS):
                    try:
                        urllib.urlretrieve(url, path)
                    except urllib.ContentTooShortError:
                        continue
                    else:
                        break


def main():
    args = parse_args()

    downloader = PhotoDownloader(args.image_path)
    pool = multiprocessing.Pool(processes=args.num_procs)

    num_pages = float("inf")
    page = 0

    min_date = MIN_DATE
    max_date = MAX_DATE

    days_in_row = 0

    search_text = args.search_text.replace(" ", "-")

    while num_pages > page:
        page += 1

        metadata, photos = parse_page(page, args.api_key, search_text,
                                      min_date, max_date)

        num_pages = int(metadata["pages"])

        print 78 * "="
        print "Page:\t\t", page, "of", num_pages
        print "Min-Date:\t", datetime.datetime.fromtimestamp(min_date)
        print "Max-Date:\t", datetime.datetime.fromtimestamp(max_date)
        print "Num-Photos:\t", len(photos)
        print 78 * "="

        try:
            # Use map_async().get() with a large timeout instead of map() so
            # that KeyboardInterrupt reaches the main process.
            pool.map_async(downloader, photos).get(1e10)
        except KeyboardInterrupt:
            pool.terminate()
            break

        if page >= num_pages:
            # Finished the current one-day window; move it back by one day.
            max_date -= TIME_SKIP
            min_date -= TIME_SKIP
            page = 0

            if num_pages == 0:
                days_in_row += 1
                num_pages = float("inf")

                print "No images for", days_in_row, "day(s) in a row"

                if days_in_row == args.max_days_without_image:
                    break
            else:
                days_in_row = 0


if __name__ == "__main__":
    main()
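

# A minimal usage sketch (the script filename and all argument values below
# are illustrative placeholders, not part of the original source). Note that
# the directory passed via --image_path is not created by the script and
# should exist beforehand:
#
#   python flickr_downloader.py \
#       --search_text "notre dame" \
#       --api_key YOUR_FLICKR_API_KEY \
#       --image_path /path/to/images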