# Copyright (c) 2018, ETH Zurich and UNC Chapel Hill.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
#     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
#       its contributors may be used to endorse or promote products derived
#       from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
# Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)
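"""Download images from Flickr for a given search text.

Minimal usage sketch (the script file name and paths are illustrative):

    python flickr_downloader.py \
        --search_text "notre dame" \
        --api_key YOUR_FLICKR_API_KEY \
        --image_path /path/to/images

Requires a Flickr API key; images are saved as <id>_<secret>.jpg under
--image_path.
"""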
import os
import time
import datetime
import urllib.request
import urllib.error
import socket
import argparse
import multiprocessing
import xml.etree.ElementTree as ElementTree


PER_PAGE = 500  # Number of results per page (500 is the Flickr API maximum).
SORT = "date-posted-desc"
URL = "https://api.flickr.com/services/rest/?method=flickr.photos.search&" \
      "api_key=%s&text=%s&sort=%s&per_page=%d&page=%d&min_upload_date=%s&" \
      "max_upload_date=%s&format=rest&extras=url_o,url_l,url_c,url_z,url_n"
MAX_PAGE_REQUESTS = 5   # Retry attempts per search page request.
MAX_PAGE_TIMEOUT = 20   # Timeout in seconds for each page request.
MAX_IMAGE_REQUESTS = 3  # Retry attempts per image download.
TIME_SKIP = 24 * 60 * 60  # Width of one upload-date window: one day in seconds.
MAX_DATE = time.time()
MIN_DATE = MAX_DATE - TIME_SKIP


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--search_text", required=True)
    parser.add_argument("--api_key", required=True)
    parser.add_argument("--image_path", required=True)
    parser.add_argument("--num_procs", type=int, default=10)
    parser.add_argument("--max_days_without_image", type=int, default=365)
    args = parser.parse_args()
    return args


def compose_url(page, api_key, text, min_date, max_date):
    return URL % (api_key, text, SORT, PER_PAGE, page,
                  str(min_date), str(max_date))
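

# A Flickr REST search response has roughly the following shape (attribute
# values are illustrative; see the flickr.photos.search API documentation):
#
#   <rsp stat="ok">
#     <photos page="1" pages="12" perpage="500" total="5634">
#       <photo id="..." secret="..." url_o="https://..." ... />
#     </photos>
#   </rsp>
#
# parse_page() below returns the attributes of <photos> together with the
# list of per-<photo> attribute dictionaries.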
def parse_page(page, api_key, text, min_date, max_date):
    # Request one result page, retrying a few times on timeouts.
    f = None
    for _ in range(MAX_PAGE_REQUESTS):
        try:
            f = urllib.request.urlopen(
                compose_url(page, api_key, text, min_date, max_date),
                timeout=MAX_PAGE_TIMEOUT)
        except socket.timeout:
            continue
        else:
            break

    if f is None:
        # Every attempt timed out; report an empty result set.
        return {"pages": "0",
                "total": "0",
                "page": "0",
                "perpage": "0"}, tuple()

    response = f.read()
    root = ElementTree.fromstring(response)

    if root.attrib["stat"] != "ok":
        raise IOError("Flickr API request failed with stat=%r"
                      % root.attrib["stat"])

    photos = [photo.attrib for photo in root.iter("photo")]

    return root.find("photos").attrib, photos


class PhotoDownloader(object):

    def __init__(self, image_path):
        self.image_path = image_path

    def __call__(self, photo):
        image_name = "%s_%s.jpg" % (photo["id"], photo["secret"])
        path = os.path.join(self.image_path, image_name)
        if not os.path.exists(path):
            # Pick the largest available size; the suffixes match the url_*
            # extras requested in URL, ordered from largest to smallest.
            url = None
            for url_suffix in ("o", "l", "c", "z", "n"):
                url_attr = "url_%s" % url_suffix
                if photo.get(url_attr) is not None:
                    url = photo.get(url_attr)
                    break
            if url is not None:
                print(url)
                for _ in range(MAX_IMAGE_REQUESTS):
                    try:
                        urllib.request.urlretrieve(url, path)
                    except urllib.error.ContentTooShortError:
                        continue
                    else:
                        break
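
# Note: multiprocessing.Pool pickles the callable and each photo dict when
# dispatching work, so PhotoDownloader must stay a module-level, picklable
# class. A minimal usage sketch (values illustrative):
#
#   downloader = PhotoDownloader("/path/to/images")
#   downloader({"id": "123", "secret": "abc", "url_o": "https://..."})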


def main():
    args = parse_args()

    downloader = PhotoDownloader(args.image_path)
    pool = multiprocessing.Pool(processes=args.num_procs)

    num_pages = float("inf")
    page = 0

    min_date = MIN_DATE
    max_date = MAX_DATE

    days_in_row = 0

    search_text = args.search_text.replace(" ", "-")
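
    # Flickr returns only a bounded number of results per query, so the crawl
    # slides a one-day upload-date window backwards through time: all pages of
    # the current window are fetched, then the window moves TIME_SKIP seconds
    # into the past. The crawl stops after max_days_without_image consecutive
    # days without a single photo.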
    while num_pages > page:
        page += 1

        metadata, photos = parse_page(page, args.api_key, search_text,
                                      min_date, max_date)

        num_pages = int(metadata["pages"])

        print(78 * "=")
        print("Page:\t\t", page, "of", num_pages)
        print("Min-Date:\t", datetime.datetime.fromtimestamp(min_date))
        print("Max-Date:\t", datetime.datetime.fromtimestamp(max_date))
        print("Num-Photos:\t", len(photos))
        print(78 * "=")

        try:
            # The explicit (very large) timeout keeps the main process
            # responsive to KeyboardInterrupt while the workers download.
            pool.map_async(downloader, photos).get(1e10)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            break

        if page >= num_pages:
            # All pages of this window are done; move one day backwards.
            max_date -= TIME_SKIP
            min_date -= TIME_SKIP
            page = 0

        if num_pages == 0:
            days_in_row += 1
            num_pages = float("inf")

            print("    No images in", days_in_row, "days in a row")

            if days_in_row == args.max_days_without_image:
                break
        else:
            days_in_row = 0


if __name__ == "__main__":
    main()