1#!/usr/bin/env python
2from __future__ import division, print_function
3"""Retrieve a remote file via http to a local file.
4
5Note: at exit attempts to abort all outstanding transfers and delete the output files.
6
7To do: try polling while connected instead of using a progress callback.
8The advantages are:
9- fewer calls (except over a slow connection)
10- can implement a "stalled" time limit that only kicks in
11  if the given time elapses with no new bytes read.
To do this you will have to get the full size from the token:
    self._tkApp.eval("set [set token](totalsize)")
and you should probably get the current size as well
(rather than using the convenience function),
because it is not clear that the convenience function exists in older Tcl.
Note: I doubt this Tcl code can be executed with _tkApp.call --
at least nothing I tried with call worked.
19
20Loosely based on RO.Wdg.FTPLogWdg.
21
22History:
232005-07-08 ROwen
242005-07-11 ROwen    Modified to call stateFunc less often during download.
252005-09-15 ROwen    Documented the arguments for callback functions.
262008-04-29 ROwen    Fixed reporting of exceptions that contain unicode arguments.
272010-05-26 ROwen    Tweaked to use _removeAllCallbacks() instead of nulling _callbacks.
282010-11-12 ROwen    Bug fix: timeLim was mishandled.
292011-06-16 ROwen    Ditched obsolete "except (SystemExit, KeyboardInterrupt): raise" code
302012-08-01 ROwen    Changed isDone()->isDone, getStateStr()->state, getErrMsg()->errMsg,
31                    getBytes()->readBytes, totBytes;
32                    Deleted getState() (use state, but returns a string, not an int);
33                    Added didFail, isAbortable.
34                    State constants are now strings, not integers.
352014-04-01 ROwen    Bug fix: "unknown state" message used an undefined variable.
362014-09-16 ROwen    Modified to use RO.AddCallback.safeCall2.
37                    Modified _cleanup to deregister tcl callbacks before cleaning up the connection.
38                    Modified to print warnings to stderr instead of stdout.
392014-09-18 ROwen    Fixed a bug in the unit test.
402015-09-24 ROwen    Replace "== None" with "is None" to modernize the code.
412015-11-03 ROwen    Replace "!= None" with "is not None" to modernize the code.
42"""
43__all__ = ['HTTPGet']
44
45import atexit
46import os
47import sys
48import time
49import Tkinter
50import RO.AddCallback
51import RO.StringUtil
52import RO.TkUtil
53
# Debug switches: when True, diagnostic messages are printed to stdout.
_Debug = False # gates the diagnostic prints in HTTPGet
_DebugExit = False # gates the diagnostic prints in _ExitClass (at-exit abort bookkeeping)

_ProgressInterval = 0.1 # minimum time between progress callbacks (sec)
58
class _ExitClass(object):
    """Keep track of outstanding network transfers and abort them at exit.

    Inputs:
    - timeLim   maximum time (sec) abortAll waits for the aborted transfers to finish
    - dtime     interval (sec) between checks while waiting; values below 0.01 are raised to 0.01

    Transfers are registered with addTransfer, which also arranges for the transfer
    to deregister itself (via its done callback) when it finishes.
    """
    def __init__(self, timeLim = 2.0, dtime = 0.1):
        # the dict is used as a set: the keys are the outstanding transfers, the values are unused
        self.transferDict = {}
        self.timeLim = timeLim
        self.dtime = max(dtime, 0.01)
        self.didRegisterExit = False

    def addTransfer(self, httpGetter):
        """Add one httpGetter.

        The first call also registers abortAll with atexit.
        """
        if _DebugExit:
            print("HTTPGet._Exit.addTransfer(%s)" % (httpGetter,))
        httpGetter.addDoneCallback(self.removeTransfer)
        self.transferDict[httpGetter] = None
        if not self.didRegisterExit:
            atexit.register(self.abortAll)
            self.didRegisterExit = True

    def removeTransfer(self, httpGetter):
        """Remove one httpGetter.

        Does not verify that the getter is finished.
        Silently does nothing if the getter is not registered.
        """
        if _DebugExit:
            print("HTTPGet._Exit.removeTransfer(%s)" % (httpGetter,))
        self.transferDict.pop(httpGetter, None)

    def abortAll(self):
        """Abort all outstanding transfers.

        Meant to be registered with atexit.

        After requesting the aborts, polls every dtime seconds until all transfers
        have deregistered themselves or roughly timeLim seconds have elapsed.
        """
        if _DebugExit:
            print("HTTPGet._Exit.abortAll()")
        if not self.transferDict:
            return

        # Iterate over a snapshot: abort() may call removeTransfer synchronously,
        # and mutating the dict while iterating over a live key view is an error.
        transferList = list(self.transferDict)
        for xfer in transferList:
            if _DebugExit:
                print("HTTGet._Exit: aborting %s" % (xfer,))
            xfer.abort()

        # wait a few seconds for all to end
        maxWaits = self.timeLim / self.dtime
        nWaits = 0
        while self.transferDict:
            time.sleep(self.dtime)
            nWaits += 1
            if nWaits > maxWaits:
                if _DebugExit:
                    print("HTTGet._Exit: timed out while waiting for aborts to finish")
                break
        else:
            # the loop ended because transferDict emptied, not because of the time limit
            if _DebugExit:
                print("HTTGet._Exit: all aborts finished")

_ExitObj = _ExitClass()
117
118
class HTTPGet(RO.AddCallback.BaseMixin):
    """Downloads the specified url to a file.

    Inputs:
    - fromURL   url of file to download
    - toPath    full path of destination file
    - isBinary  file is binary? (if False, EOL translation is probably performed)
    - overwrite if True, overwrites the destination file if it exists;
                otherwise the transfer fails (state Failed) when it is started
    - createDir if True, creates any required directories;
                otherwise the transfer fails (state Failed) when it is started
    - doneFunc  function to call when the transfer completes
    - stateFunc function to call when state changes, including data received
                (stateFunc will be called when the transfer ends)
    - startNow  if True, the transfer is started immediately;
                otherwise the transaction remains Queued until start is called
    - dispStr   a string to display while downloading the file;
                if omitted, fromURL is displayed
    - timeLim   time limit (sec) for the total transfer; if None then no limit

    Callbacks receive one argument: this object.

    Problems detected while starting the transfer (existing output file, missing directory,
    unable to open the output file or the connection) are not raised to the caller:
    they set the state to Failed and are reported via errMsg.
    """
    # state constants
    Queued = "Queued"
    Connecting = "Connecting"
    Running = "Running"
    Aborting = "Aborting"
    Done = "Done"
    Aborted = "Aborted"
    Failed = "Failed"

    _AllStates = set((
        Queued,
        Connecting,
        Running,
        Aborting,
        Done,
        Aborted,
        Failed,
    ))
    _AbortableStates = set((Queued, Connecting, Running))
    _DoneStates = set((Done, Aborted, Failed))
    _FailedStates = set((Aborted, Failed))

    # length of the longest state string
    StateStrMaxLen = max(len(stateStr) for stateStr in _AllStates)
    _tkApp = None

    def __init__(self,
        fromURL,
        toPath,
        isBinary = False,
        overwrite = False,
        createDir = True,
        doneFunc = None,
        stateFunc = None,
        startNow = True,
        dispStr = None,
        timeLim = None,
    ):
        # NOTE(review): this assigns an instance attribute, so the class attribute stays None
        # and a new Tkinter.Frame is created for every HTTPGet; confirm whether
        # caching the interpreter on the class was intended.
        if self._tkApp is None:
            self._tkApp = Tkinter.Frame().tk
        self.fromURL = fromURL
        self.toPath = toPath
        self.isBinary = isBinary
        self.overwrite = bool(overwrite)
        self.createDir = createDir
        # time limit in integer milliseconds, as passed to ::http::geturl -timeout; 0 if no limit
        if timeLim is not None:
            self.timeLimMS = max(1, int(round(timeLim * 1000.0)))
        else:
            self.timeLimMS = 0

        if dispStr is None:
            self.dispStr = fromURL
        else:
            self.dispStr = dispStr

        self._tclFile = None
        self._tclHTTPConn = None
        self._tclCallbacks = ()
        self._tclHTTPDoneCallback = None
        self._tclHTTPProgressCallback = None
        self._lastProgReportTime = 0

        self._readBytes = 0
        self._lastReadBytes = 0
        self._didPrintBlockSize = None # for debug output only
        self._totBytes = None
        self._state = self.Queued
        self._errMsg = None

        # True once the output file has been opened (and thus created or truncated) by this object
        self._createdFile = False

        self._tkApp.eval('package require http')

        RO.AddCallback.BaseMixin.__init__(self, stateFunc, callNow=False)
        self._doneCallbacks = []

        # register for abort at exit; this adds a done callback, so _doneCallbacks must already exist
        _ExitObj.addTransfer(self)

        if doneFunc:
            self.addDoneCallback(doneFunc)

        if startNow:
            self.start()

    def addDoneCallback(self, func):
        """Add a function that will be called when the transfer completes.

        The function receives one argument: this object.
        """
        self._doneCallbacks.append(func)

    def removeDoneCallback(self, func):
        """Remove a done callback.

        Raise ValueError if func is not a registered done callback.
        """
        self._doneCallbacks.remove(func)

    def start(self):
        """Start the download.

        If state is not Queued, raises RuntimeError.

        Any other error while preparing the output file or starting the transfer
        is reported by setting the state to Failed (see errMsg) instead of raising.
        """
        if _Debug:
            print("%s.start()" % (self,))
        if self._state != self.Queued:
            raise RuntimeError("Cannot start; state = %r not Queued" % (self._state,))

        self._setState(self.Connecting)

        try:
            # verify output file and verify/create output directory, as appropriate
            self._toPrep()

            # open output file
            if _Debug:
                print("HTTPGet: opening output file %r" % (self.toPath,))
            try:
                self._tclFile = self._tkApp.call('open', self.toPath, "w")
                self._createdFile = True
                if self.isBinary:
                    self._tkApp.call('fconfigure', self._tclFile, "-encoding", "binary", "-translation", "binary")
            except Tkinter.TclError as e:
                raise RuntimeError("Could not open %r: %s" % (self.toPath, e))

            # start http transfer
            doneCallback = RO.TkUtil.TclFunc(self._httpDoneCallback, debug=_Debug)
            progressCallback = RO.TkUtil.TclFunc(self._httpProgressCallback, debug=_Debug)
            # keep references so _cleanup can deregister the tcl callbacks
            self._tclCallbacks = (doneCallback, progressCallback)
            if _Debug:
                print("HTTPGet: creating http connection")
            self._tclHTTPConn = self._tkApp.call(
                '::http::geturl', self.fromURL,
                '-channel', self._tclFile,
                '-command', doneCallback,
                '-progress', progressCallback,
                '-binary', self.isBinary,
                '-timeout', self.timeLimMS
            )
        except Exception as e:
            # _setState(Failed) runs _cleanup, which closes and deletes the output file if it was opened
            self._setState(self.Failed, RO.StringUtil.strFromException(e))
            return

        self._setState(self.Running)

    def abort(self):
        """Start aborting: cancel the transaction and delete the output file.
        Silently fails if the transaction has already completed
        """
        if _Debug:
            print("%s.abort()" % (self,))
        if self.isDone:
            return
        elif self._state == self.Queued:
            # nothing has been started, so there is nothing to reset
            self._setState(self.Aborted)
            return

        if self._tclHTTPConn is None:
            sys.stderr.write("HTTPGet cannot abort: isDone false but no http connection\n")
            return

        self._setState(self.Aborting)
        self._tkApp.call("::http::reset", self._tclHTTPConn)
        if _Debug:
            print("http connection reset")

    @property
    def errMsg(self):
        """If the transfer failed, an explanation as a string, else None
        """
        return self._errMsg

    @property
    def state(self):
        """Returns the current state as a string.
        """
        return self._state

    @property
    def isAbortable(self):
        """True if the transaction can be aborted
        """
        return self._state in self._AbortableStates

    @property
    def isDone(self):
        """Return True if the transaction is finished (succeeded, aborted or failed), False otherwise.
        """
        return self._state in self._DoneStates

    @property
    def didFail(self):
        """Return True if the transaction failed or was aborted
        """
        return self._state in self._FailedStates

    @property
    def readBytes(self):
        """Bytes read so far
        """
        return self._readBytes

    @property
    def totBytes(self):
        """Total bytes in file, if known, None otherwise.

        The value is certain to be unknown until the transfer starts;
        after that it depends on whether the server sends the info.
        """
        return self._totBytes

    def _setState(self, newState, errMsg=None):
        """Set a new state and call callbacks.
        Do nothing if already done.
        errMsg is ignored unless newState is Failed.

        If the new state is a done state: clean up, call the state callbacks,
        call the done callbacks and then remove all callbacks.

        Raise RuntimeError if newState unknown.
        """
        if _Debug:
            print("%s._setState(newState=%s, errmsg=%r)" % (self, newState, errMsg))
        # once the transfer is done the state is final; ignore further changes
        if self.isDone:
            return

        if newState not in self._AllStates:
            raise RuntimeError("Unknown state %r" % (newState,))

        self._state = newState
        if newState == self.Failed:
            self._errMsg = errMsg

        isDone = self.isDone
        if isDone:
            self._cleanup()

        self._doCallbacks()
        if isDone:
            # call done callbacks
            # use a copy in case a callback deregisters itself
            for func in self._doneCallbacks[:]:
                RO.AddCallback.safeCall2(str(self), func, self)

            # remove all callbacks
            self._removeAllCallbacks()
            self._doneCallbacks = []
            self._tclCallbacks = ()

    def _cleanup(self):
        """Clean up everything except references to callbacks.

        Warning: this is a private method: call only from _setState!

        Deregister the tcl callbacks, clean up the http connection and close the output file.
        If state in (Aborted, Failed) and the output file was opened, delete the output file.
        """
        if _Debug:
            print("%s._cleanup()" % (self,))
        for tclFunc in self._tclCallbacks:
            if _Debug:
                print("deregister %s" % (tclFunc,))
            tclFunc.deregister()
        if self._tclHTTPConn is not None:
            self._tkApp.call("::http::cleanup", self._tclHTTPConn)
            self._tclHTTPConn = None
            if _Debug:
                print("http connection cleaned up")
        if self._tclFile:
            self._tkApp.call("close", self._tclFile)
            self._tclFile = None
            if _Debug:
                print("output file closed")

        if self._createdFile and self._state in (self.Aborted, self.Failed):
            # deleting the partial output file is best-effort; a failure is only reported in debug mode
            try:
                os.remove(self.toPath)
                if _Debug:
                    print("deleted output file")
            except OSError as e:
                if _Debug:
                    print("failed to delete output file: %s" % (e,))

    def _httpDoneCallback(self, token=None):
        """Called when the http transfer is finished.

        Translate the status of the http connection into a final state
        (Done, Aborted or Failed, with an error message if Failed) and set it.

        Inputs:
        - token     argument supplied by the tcl callback; ignored
        """
        if self._tclHTTPConn is None:
            sys.stderr.write("HTTPGet warning: _httpDoneCallback called but no http connection\n")
            return

        if _Debug:
            print("%s.httpDoneCallback()" % (self,))
            print("status=%r" % (self._tkApp.call('::http::status', self._tclHTTPConn),))
            print("code=%r" % (self._tkApp.call('::http::code', self._tclHTTPConn),))
            print("ncode=%r" % (self._tkApp.call('::http::ncode', self._tclHTTPConn),))
            print("error=%r" % (self._tkApp.call('::http::error', self._tclHTTPConn),))

        httpState = self._tkApp.call('::http::status', self._tclHTTPConn)
        errMsg = None
        if httpState == "ok":
            # the transaction completed, but only http code 200 is treated as success
            codeNum = int(self._tkApp.call('::http::ncode', self._tclHTTPConn))
            if codeNum == 200:
                newState = self.Done
            else:
                if _Debug:
                    print("status ok but code=%s not 200" % (codeNum,))
                newState = self.Failed
                errMsg = self._tkApp.call('::http::code', self._tclHTTPConn)
        elif httpState == "eof":
            newState = self.Failed
            errMsg = "No reply from http server"
        elif httpState == "timeout":
            newState = self.Failed
            errMsg = "Timed out"
        elif httpState == "reset":
            # this is the status that abort() expects after calling ::http::reset
            newState = self.Aborted
        else:
            if httpState != "error":
                sys.stderr.write("HTTPGet warning: unknown httpState=%s; assuming error\n" % (httpState,))
            newState = self.Failed
            errMsg = self._tkApp.call('::http::error', self._tclHTTPConn)
            if not errMsg:
                errMsg = httpState

        self._setState(newState, errMsg)

    def _httpProgressCallback(self, token, totBytes, readBytes):
        """http callback function.

        Record the byte counts and call the state callbacks,
        but no more often than once per _ProgressInterval seconds.

        Inputs:
        - token     argument supplied by the tcl callback; ignored
        - totBytes  total number of bytes; converted to int
        - readBytes number of bytes read so far; converted to int
        """
        if _Debug:
            print("%s._httpProgressCallback(totBytes=%r, readBytes=%r)" % (self, totBytes, readBytes))

        self._totBytes = int(totBytes)
        self._lastReadBytes = self._readBytes
        self._readBytes = int(readBytes)
        if _Debug and not self._didPrintBlockSize and self._readBytes:
            print("%s block size=%s" % (self, self._readBytes - self._lastReadBytes))
            self._didPrintBlockSize = True

        newTime = time.time()
        if (newTime - self._lastProgReportTime) > _ProgressInterval:
            self._doCallbacks()
            self._lastProgReportTime = newTime

    def __str__(self):
        return "%s(%s)" % (self.__class__.__name__, self.fromURL)

    def _toPrep(self):
        """Create or verify the existence of the output directory
        and check if output file already exists.

        Raise RuntimeError if the output file exists and overwrite is false,
        if the output directory is missing and createDir is false,
        or if the output "directory" is a file.
        Errors from os.makedirs are not caught here.
        """
        if _Debug:
            print("%s._toPrep()" % (self,))
        # if output file exists and not overwrite, complain
        if not self.overwrite and os.path.exists(self.toPath):
            raise RuntimeError("toPath %r already exists" % (self.toPath,))

        # if directory does not exist, create it or fail, depending on createDir;
        # else if "directory" exists but is a file, fail
        toDir = os.path.dirname(self.toPath)
        if toDir:
            if not os.path.exists(toDir):
                # create the directory or fail, depending on createDir
                if self.createDir:
                    if _Debug:
                        print("%s._toPrep creating directory %r" % (self, toDir))
                    os.makedirs(toDir)
                else:
                    raise RuntimeError("directory %r does not exist" % (toDir,))
            elif not os.path.isdir(toDir):
                raise RuntimeError("%r is a file, not a directory" % (toDir,))
511
if __name__ == "__main__":
    # Manual test: download one page, report each state change and quit when the transfer ends.
    _Debug = False
    _DebugExit = True

    testURL = "http://www.astro.washington.edu/"
    outFile = "httpget_test.html"

    root = Tkinter.Tk()

    def stateCallback(httpObj):
        """Print the state and progress of the transfer; stop the event loop once it is done.
        """
        print(
            "state =",
            httpObj.state,
            "read %s of %s bytes" % (httpObj.readBytes, httpObj.totBytes),
        )
        if not httpObj.isDone:
            return
        if httpObj.errMsg:
            print("error message =", httpObj.errMsg)
        root.quit()

    httpObj = HTTPGet(
        fromURL = testURL,
        toPath = outFile,
        isBinary = False,
        overwrite = True,
        stateFunc = stateCallback,
        startNow = True,
    )

    root.mainloop()
539