1from __future__ import print_function
2import copy
3import os
4from itertools import chain
5from . import dbdict
6import operator
7import marshal
8import hashlib
9import multiprocessing.managers
10from collections import defaultdict
11from .ruffus_exceptions import *
12from functools import reduce
13import glob
14import types
15import sys
16import re
17if sys.hexversion < 0x03000000:
18    from future_builtins import zip
19################################################################################
20#
21#   ruffus_utility.py
22#
23#
24#   Copyright (c) 10/9/2009 Leo Goodstadt
25#
26#   Permission is hereby granted, free of charge, to any person obtaining a copy
27#   of this software and associated documentation files (the "Software"), to deal
28#   in the Software without restriction, including without limitation the rights
29#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
30#   copies of the Software, and to permit persons to whom the Software is
31#   furnished to do so, subject to the following conditions:
32#
33#   The above copyright notice and this permission notice shall be included in
34#   all copies or substantial portions of the Software.
35#
36#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
42#   THE SOFTWARE.
43#################################################################################
44
45
46"""
47
48********************************************
49:mod:`ruffus_utility` -- Overview
50********************************************
51
52
53.. moduleauthor:: Leo Goodstadt <ruffus@llew.org.uk>
54
55    Common utility functions
56
57
58"""
59
60
61# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
62
63#   imports
64
65
66# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
67#import task
68try:
69    from collections.abc import Callable
70except ImportError:
71    from collections import Callable
72try:
73    import cPickle as pickle
74except:
75    import pickle as pickle
76if sys.hexversion >= 0x03000000:
77    # everything is unicode in python3
78    path_str_type = str
79else:
80    path_str_type = basestring
81
82# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
83
84#   Constants
85
86
87# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
88
89#
90# file to store history out to
91#
92RUFFUS_HISTORY_FILE = '.ruffus_history.sqlite'
93# If DEFAULT_RUFFUS_HISTORY_FILE is specified in the environment variables, use that instead
94if "DEFAULT_RUFFUS_HISTORY_FILE" in os.environ:
95    RUFFUS_HISTORY_FILE = os.environ["DEFAULT_RUFFUS_HISTORY_FILE"]
96
97
98# only rerun when the file timestamps are out of date (classic mode)
99CHECKSUM_FILE_TIMESTAMPS = 0
100# also rerun when the history shows a job as being out of date
101CHECKSUM_HISTORY_TIMESTAMPS = 1
102CHECKSUM_FUNCTIONS = 2     # also rerun when function body has changed
103# also rerun when function parameters or function body change
104CHECKSUM_FUNCTIONS_AND_PARAMS = 3
105
106CHECKSUM_REGENERATE = 2     # regenerate checksums
107
108
109# number of times to check if an input file exists
110FILE_CHECK_RETRY = 5
111# number of seconds to sleep before retrying a file check
112FILE_CHECK_SLEEP = 10
113# _________________________________________________________________________________________
114
115#   t_extra_inputs
116#       namespaced enum
117
118# _________________________________________________________________________________________
119
120
121class t_extra_inputs:
122    (ADD_TO_INPUTS, REPLACE_INPUTS, KEEP_INPUTS, KEEP_OUTPUTS) = list(range(4))
123
124
125class inputs(object):
126    def __init__(self, *args):
127        self.args = args
128
129    def __repr__(self, *args):
130        return 'inputs%r' % (self.args,)
131
132
133class add_inputs(object):
134    def __init__(self, *args):
135        self.args = args
136
137    def __repr__(self, *args):
138        return 'add_inputs%r' % (self.args,)
139
140
141def get_default_checksum_level():
142    """
143    Use the checksum level from the environmental variable DEFAULT_RUFFUS_CHECKSUM_LEVEL
144    Otherwise default to CHECKSUM_HISTORY_TIMESTAMPS
145    """
146
147    #
148    #   environmental variable not set
149    #
150    if "DEFAULT_RUFFUS_CHECKSUM_LEVEL" not in os.environ:
151        return CHECKSUM_HISTORY_TIMESTAMPS
152
153    #
154    # lookup value from list of CHECKSUM_XXX constants
155    #
156    checksum_level = None
157    env_checksum_level = os.environ["DEFAULT_RUFFUS_CHECKSUM_LEVEL"]
158    if len(env_checksum_level) == 1 and env_checksum_level in "0123":
159        checksum_level = int(env_checksum_level)
160    else:
161        global_var = globals()
162        for key in global_var:
163            if key.startswith('CHECKSUM') and global_var[key] == env_checksum_level:
164                checksum_level = value
165
166    #
167    #   check environmental variable is valid string
168    #
169    if checksum_level is None:
170        raise error_checksum_level(("The environmental value "
171                                    "DEFAULT_RUFFUS_CHECKSUM_LEVEL should be: [0-3 | "
172                                    "CHECKSUM_FILE_TIMESTAMPS | "
173                                    "CHECKSUM_HISTORY_TIMESTAMPS | "
174                                    "CHECKSUM_FUNCTIONS | "
175                                    "CHECKSUM_FUNCTIONS_AND_PARAMS] (rather than '%s') ")
176                                   % (env_checksum_level,))
177
178    return checksum_level
179
180
181# _________________________________________________________________________________________
182
183#   open_job_history
184
185# _________________________________________________________________________________________
186def get_default_history_file_name():
187    history_file = RUFFUS_HISTORY_FILE
188    #
189    #   try path expansion using the main script name
190    #
191    try:
192        import __main__ as main
193        path_parts = path_decomposition(os.path.abspath(main.__file__))
194        history_file = history_file.format(**path_parts)
195    except Exception:
196        pass
197    return history_file
198
199
200def open_job_history(history_file):
201    """
202    Given a history file name, opens the correspond sqllite db file and returns the handle
203    """
204    if not history_file:
205        history_file = get_default_history_file_name()
206
207    return dbdict.open(history_file, picklevalues=True)
208
209
210# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
211
212#   Functions
213
214
215# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
216
217class JobHistoryChecksum:
218    """Class to remember exactly how an output file was created and when."""
219
220    def __str__(self):
221        from time import strftime, gmtime
222        if hasattr(self, "params"):
223            return str([self.outfile,
224                        strftime("%d %b %Y %H:%M:%S", gmtime(self.mtime)),
225                        self.params,
226                        self.task_name
227                        ])
228        else:
229            return strftime("%d %b %Y %H:%M:%S", gmtime(self.mtime))
230
231    def __init__(self, outfile, mtime, params, task):
232        # filename and modification time
233        self.outfile = outfile
234        self.mtime = mtime
235
236        # Uncomment next two lines to debug:
237        #self.params = params
238        #self.task_name = task._name
239
240        # checksum exact params used to generate this output file
241        self.chksum_params = hashlib.md5(pickle.dumps(params)).hexdigest()
242        # checksum the function bytecode as well as the function context
243        # Don't use func_code alone-- changing the line number of the function,
244        # what global variables are available, etc would all change the checksum
245        if sys.hexversion >= 0x03000000:
246            code = task.user_defined_work_func.__code__
247            func_defaults = task.user_defined_work_func.__defaults__
248        else:
249            code = task.user_defined_work_func.func_code
250            func_defaults = task.user_defined_work_func.func_defaults
251        func_code = marshal.dumps(code.co_code)
252
253        #
254        #   pickle code very defensively, but hopefully without breaking Jake Biesinger's pipelines!
255        #
256        attributes_to_pickle = [func_defaults,
257                                code.co_argcount,
258                                code.co_consts,
259                                code.co_names,
260                                code.co_nlocals,
261                                code.co_varnames]
262
263        pickle_results = []
264        for aa in attributes_to_pickle:
265            # Can't cpickle nested functions: typically blows up with func_code.co_consts
266            try:
267                pickle_results.append(pickle.dumps(aa))
268                continue
269            except:
270                pass
271            # Marshal seems to be less sensitive: try that
272            try:
273                pickle_results.append(marshal.dumps(aa))
274                continue
275            except:
276                pass
277            # Just make a string out of the attribute
278            try:
279                pickle_results.append(str(aa))
280                continue
281            except:
282                pass
283            # OK give up, do nothing: On your head it is
284
285        func_extras = reduce(operator.add, pickle_results)
286        self.chksum_func = hashlib.md5(func_code + func_extras).hexdigest()
287
288
289# _________________________________________________________________________________________
290#
291#   parameter_list_as_string
292#
293# _________________________________________________________________________________________
294def parameter_list_as_string(parameters):
295    """
296    Input list of parameters
297       Turn this into a string for display
298
299        E.g.
300
301    """
302    if parameters is None:
303        return ""
304    elif not isinstance(parameters, list):
305        raise Exception("Unexpected parameter list %s" % (parameters,))
306    else:
307        return str(parameters)[1:-1]
308
309# _________________________________________________________________________________________
310#
311#   path_decomposition
312#
313# _________________________________________________________________________________________
314
315
316def path_decomposition(orig_path):
317    """
318    returns a dictionary identifying the components of a file path:
319        This has the following keys
320            basename: (any) base (file) name of the path not including the extension. No slash included
321            ext:      (any) extension of the path including the "."
322            path:     a list of subpaths created by removing subdirectory names
323            subdir:   a list of subdirectory names from the most nested to the root
324        For example
325            apath = "/a/b/c/d/filename.txt"
326            {   'basename': 'filename',
327                'ext':      '.txt'
328                'path':     ['/a/b/c/d', '/a/b/c', '/a/b', '/a', '/'], ,
329                'subdir': ['d', 'c', 'b', 'a', '/']
330            }
331            "{path[2]}/changed/{subdir[0]}".format(**res) = '/a/b/changed/d'
332            "{path[3]}/changed/{subdir[1]}".format(**res) = '/a/changed/c'
333    """
334    def recursive_split(a_path):
335        """
336        split the path into its subdirectories recursively
337        """
338        if not len(a_path):
339            return [[], []]
340        if a_path == "/" or a_path == "//":
341            return [[a_path], [a_path]]
342        sub_path_part, sub_dir_part = os.path.split(a_path)
343        if sub_dir_part:
344            sub_path_parts, sub_dir_parts = recursive_split(sub_path_part)
345            return [[a_path] + sub_path_parts,
346                    [sub_dir_part] + sub_dir_parts]
347        else:
348            return [[], ["/"]]
349    #
350    if not len(orig_path):
351        return {'path': [], 'basename': '', 'ext': '', 'subdir': []}
352
353    # stop normpath from being too clever and removing initial ./ and terminal slash, turning paths into filenames
354    if orig_path in ["./", "/."]:
355        a_path = orig_path
356    else:
357        a_path = os.path.normpath(orig_path)
358        if orig_path[0:2] == "./" and a_path[0:2] != "./":
359            a_path = "./" + a_path
360
361        if orig_path[-1] == "/" and a_path[-1:] != "/":
362            a_path += "/"
363
364    path_part, file_part = os.path.split(a_path)
365    file_part, ext_part = os.path.splitext(file_part)
366    subpaths, subdirs = recursive_split(path_part)
367    return {'basename': file_part,
368            'ext':      ext_part,
369            'subpath':  subpaths,
370            'subdir':   subdirs,
371            'path':     path_part}
372
373
374# _________________________________________________________________________________________
375#
376#   get_nth_nested_level_of_path
377#
378# _________________________________________________________________________________________
379def get_nth_nested_level_of_path(orig_path, n_levels):
380    """
381    Return path with up to N levels of subdirectories
382    0 = full path
383    N = 1 : basename
384    N = 2 : basename + one subdirectory
385
386    For example
387        0   /test/this/now/or/not.txt
388        1   not.txt
389        2   or/not.txt
390        3   now/or/not.txt
391        4   this/now/or/not.txt
392        5   test/this/now/or/not.txt
393        6   /test/this/now/or/not.txt
394        7   /test/this/now/or/not.txt
395    """
396    # FIXME: consider returning full path to make debugging easier or at least
397    # make it optional
398    if not n_levels or n_levels < 0:
399        return orig_path
400    res = path_decomposition(orig_path)
401    basename = res["basename"] + res["ext"]
402    shortened_path = os.path.join(
403        *(list(reversed(res["subdir"][0:(n_levels - 1)]))+[basename]))
404    if len(shortened_path) < len(orig_path):
405        return ".../" + shortened_path
406
407
408# _________________________________________________________________________________________
409#
410#   swap_nesting_order
411#
412# _________________________________________________________________________________________
413def swap_nesting_order(orig_coll):
414    """
415    Reverse nested order so that coll[3]['a'] becomes coll['a'][3]
416    """
417    new_dict = defaultdict(dict)
418    new_list = []
419    for ii, ii_item in enumerate(orig_coll):
420        for jj, value in ii_item.items():
421            if isinstance(jj, int):
422                # resize
423                new_list += [{}]*(jj + 1 - len(new_list))
424                new_list[jj][ii] = value
425            else:
426                new_dict[jj][ii] = value
427    return new_list, dict(new_dict)
428
429# _________________________________________________________________________________________
430#
431#   swap_doubly_nested_order
432#
433# _________________________________________________________________________________________
434
435
436def swap_doubly_nested_order(orig_coll):
437    """
438    Reverse nested order so that coll[3]['a'] becomes coll['a'][3]
439    """
440    new_dict = dict()
441    new_list = []
442    for ii, ii_item in enumerate(orig_coll):
443        for jj, jj_item in enumerate(ii_item):
444            for kk, value in jj_item.items():
445                if isinstance(kk, int):
446                    # resize
447                    new_list += [{}]*(kk + 1 - len(new_list))
448                    if ii not in new_list[kk]:
449                        new_list[kk][ii] = dict()
450                    new_list[kk][ii][jj] = value
451                else:
452                    if kk not in new_dict:
453                        new_dict[kk] = dict()
454                    if ii not in new_dict[kk]:
455                        new_dict[kk][ii] = dict()
456                    new_dict[kk][ii][jj] = value
457
458    return new_list, new_dict
459
460
461# _________________________________________________________________________________________
462#
463#   regex_matches_as_dict
464#
465# _________________________________________________________________________________________
466def regex_matches_as_dict(test_str, compiled_regex):
467    """
468    Returns result of regular expression match in a dictionary
469        combining both named and unnamed captures
470    """
471    if compiled_regex:
472        if isinstance(compiled_regex, path_str_type):
473            compiled_regex = re.compile(compiled_regex)
474        mm = compiled_regex.search(test_str)
475        # Match failed
476        if mm is None:
477            return False
478        else:
479            # No capture
480            if mm.lastindex is None:
481                return {0: mm.group(0)}
482            # Combined named and unnamed captures
483            else:
484                # no dictionary comprehensions in python 2.6 :-(
485                #matchdicts.append({i : mm.group(i) for i in (range(mm.lastindex) + mm.groupdict().keys())})
486                #   Keys for captures:
487                #       1) unnamed captures = range(mm.lastindex + 1)
488                #       2) named captures   = mm.groupdict().keys()
489                return dict((i, mm.group(i)) for i in (chain(iter(range(mm.lastindex + 1)),
490                                                             iter(mm.groupdict().keys()))))
491
492    else:
493        return None
494
495
496# _________________________________________________________________________________________
497#
498#   path_decomposition_regex_match
499#
500# _________________________________________________________________________________________
501def path_decomposition_regex_match(test_str, compiled_regex):
502    """
503    Returns a dictionary identifying the components of a file path.
504
505    This includes both the components of a path:
506        basename: (any) base (file) name of the path not including the extension. No slash included
507        ext:      (any) extension of the path including the "."
508        path:     a list of subpaths created by removing subdirectory names
509        subdir:   a list of subdirectory names from the most nested to the root
510    and regular expression matches
511        The keys are the index or name of the capturing group.
512
513
514    If compiled_regexes is not specified, return path decomposition only
515
516    If compiled_regexes is specified, and the corresponding regular expression does not match,
517        the entire match fails
518
519    For example
520
521        path_decomposition_regex_match("/a/b/c/sample1.bam", r"(.*)(?P<id>\d+)\..+")
522
523            {
524                0:          '/a/b/c/sample1.bam',           // captured by index
525                1:          '/a/b/c/sample',                // captured by index
526                'id':       '1'                             // captured by name
527                'ext':      '.bam',
528                'subdir':   ['c', 'b', 'a', '/'],
529                'subpath':  ['/a/b/c', '/a/b', '/a', '/'],
530                'path':     '/a/b/c',
531                'basename': 'sample1',
532            },
533
534        path_decomposition_regex_match("dbsnp15.vcf", r"(.*)(?P<id>\d+)\..+")
535            {
536                0: 'dbsnp15.vcf',                           // captured by index
537                1: 'dbsnp1',                                // captured by index
538                'id': '5'                                   // captured by name
539                'ext': '.vcf',
540                'subdir': [],
541                'path': [],
542                'basename': 'dbsnp15',
543            },
544
545
546        // fail
547        path_decomposition_regex_match("/test.txt", r"(.*)(?P<id>\d+)\..+")
548            {}
549
550        // path components only
551        path_decomposition_regex_match("/test.txt", None)
552            {
553                'ext': '.txt',
554                'subdir': ['/']
555                'subpath': ['/'],
556                'path': '/',
557                'basename': 'test',
558            }
559
560    """
561    pp = path_decomposition(test_str)
562
563    # regular expression not specified
564    # just path
565    if compiled_regex is None:
566        return pp
567
568    rr = regex_matches_as_dict(test_str, compiled_regex)
569
570    # regular expression match failed
571    # nothing
572    if rr == False:
573        return {}
574
575    #
576    #   regular expression matches override file decomposition values in
577    #       case of clashes between predefined keys such as "basename" and
578    #       regular expression named capture groups
579    #
580    pp.update(rr)
581    return pp
582
583
584# _________________________________________________________________________________________
585#
586#   check_compiled_regexes
587#
588# _________________________________________________________________________________________
589def check_compiled_regexes(compiled_regexes, expected_num):
590    """
591    check compiled_regexes are of the right type and number
592    """
593    if compiled_regexes is None:
594        return [None] * expected_num
595
596    if not isinstance(compiled_regexes, list):
597        raise Exception("Expecting list of None and strings")
598
599    #   pad compiled_regexes with None
600    if len(compiled_regexes) < expected_num:
601        compiled_regexes.extend(
602            [None] * (expected_num - len(compiled_regexes)))
603
604    #   Turn strings to regular expression just in case
605    #   We don't want to do this here because the error messages are not very nice:
606    #   There is not much context left
607    compiled_regexes = [re.compile(rr) if isinstance(
608        rr, path_str_type) else rr for rr in compiled_regexes]
609
610    #   check types
611    regex_types = type(re.compile("")), type(None)
612    for rr in compiled_regexes:
613        if not isinstance(rr, regex_types):
614            raise Exception(
615                "Unexpected type %s ('%s') specified in regular expression list. Expecting string or compiled regular expression" % (type(rr), rr))
616
617    return compiled_regexes
618
619
620# _________________________________________________________________________________________
621#
622#   get_all_paths_components
623#
624# _________________________________________________________________________________________
625def get_all_paths_components(paths, compiled_regexes):
626    """
627        For each path in a list,
628            If any of the regular expression matches fails, the whole list fails
629    """
630    #
631    #   merge regular expression matches and path decomposition
632    #
633    compiled_regexes = check_compiled_regexes(compiled_regexes, len(paths))
634    results = []
635    for (pp, rr) in zip(paths, compiled_regexes):
636        result = path_decomposition_regex_match(pp, rr)
637        if result == {}:
638            return [{}] * len(paths)
639        results.append(result)
640    return results
641
642
643# _________________________________________________________________________________________
644#
645#   apply_func_to_sequence
646#
647# _________________________________________________________________________________________
648def apply_func_to_sequence(seq, func, tuple_of_conforming_types=(path_str_type,), tuple_of_sequences_types=(list, tuple, set)):
649    """
650    Recurses into list/tuple/set sequences to apply func to conforming types
651    Non-conforming types are left alone
652    """
653    if isinstance(seq, tuple_of_conforming_types):
654        return func(seq)
655    elif isinstance(seq, tuple_of_sequences_types):
656        return type(seq)(apply_func_to_sequence(pp, func, tuple_of_conforming_types, tuple_of_sequences_types) for pp in seq)
657    else:
658        return seq
659
660
661# _________________________________________________________________________________________
662#
663#   t_regex_replace
664#
665# _________________________________________________________________________________________
666class t_regex_replace(object):
667    def __init__(self, filename, regex_str, compiled_regex, regex_or_suffix):
668        self.regex_or_suffix = regex_or_suffix
669        self.compiled_regex = compiled_regex
670        self.regex_str = regex_str
671        self.filename = filename
672
673    def __call__(self, p):
674        #
675        #   check if substitution pattern is mis-specified
676        #
677        if "\1"in p or "\2" in p:
678            raise error_unescaped_regular_expression_forms("['%s'] " % (p.replace("\1", r"\1").replace("\2", r"\2")) +
679                                                           "The special regular expression characters "
680                                                           r"\1 and \2 need to be 'escaped' in python. "
681                                                           r"The easiest option is to use python 'raw' strings "
682                                                           r"e.g. r'\1_in_a string\2'. See http://docs.python.org/library/re.html.")
683        #
684        #   For suffix(), replaces the suffix part by adding leading r"\1" to the substitution pattern
685        #
686        #   If r"\1" is specified, then we presume you know what you are doing...
687        #
688        if self.regex_or_suffix == SUFFIX_SUBSTITUTE:
689            if r"\1" not in p and r"\g<1>" not in p:
690                match_p = r"\g<1>" + p
691            else:
692                match_p = p
693
694            # throw exception if doesn't match regular expression at all
695            (res_str, cnt_replacements) = self.compiled_regex.subn(
696                match_p, self.filename)
697            if cnt_replacements == 0:
698                raise error_input_file_does_not_match(
699                    "File '%s' does not match suffix('%s') and pattern '%s'" % (self.filename, self.regex_str, p))
700            return res_str
701
702        #
703        #   Normal substitution
704        #
705        #   For suffix(), complete replacement by the specified pattern text
706        #           only substitute if r"\1" or r"\g<1>" is specified
707        #
708        #
709        err_str = ""
710        try:
711            (res_str, cnt_replacements) = self.compiled_regex.subn(p, self.filename)
712            if cnt_replacements > 0:
713                return res_str
714        except re.error:
715            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
716            err_str = str(exceptionValue)
717            raise fatal_error_input_file_does_not_match(
718                "File '%s' does not match regex('%s') and pattern '%s':\n\t%s\n" % (self.filename, self.regex_str, p, err_str))
719        except IndexError:
720            exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
721            err_str = str(exceptionValue)
722            raise fatal_error_input_file_does_not_match(
723                "File '%s' does not match regex('%s') and pattern '%s':\n\t%s\n" % (self.filename, self.regex_str, p, err_str))
724
725        # except (re.error, IndexError):
726            #err_str = str(sys.exc_info()[1]),
727
728        raise error_input_file_does_not_match("File '%s' does not match regex('%s') and pattern '%s'\n%s\n" % (
729            self.filename, self.regex_str, p, err_str))
730
731
732# _________________________________________________________________________________________
733#
734#   raise_formatter_substitution_exception
735#
736# _________________________________________________________________________________________
737def raise_formatter_substitution_exception(exceptionValue, formatter_str, pattern, filenames,
738                                           substitutes_list, substitutes_dict):
739    """
740    Throws an exception when formatter fails to make a substitution
741    """
742    # convert to string to get just the missing key
743    missing_key = str(exceptionValue.args[0])
744    # strip quotes
745    if missing_key[0:1] in '\'"' and missing_key[-1:] in '\'"':
746        missing_key = missing_key[1:-1]
747    raise error_input_file_does_not_match("Unmatched field {%s} in '%s' where\n  input =  %r,\n"
748                                          "  filter = formatter(%s). Possible substitutions= %s, %s."
749                                          % (missing_key,
750                                             pattern,
751                                             filenames,
752                                             formatter_str,
753                                             substitutes_list, substitutes_dict))
754
755
756# _________________________________________________________________________________________
757#
758#   t_formatter_replace
759#
760# _________________________________________________________________________________________
761class t_formatter_replace(object):
762    def __init__(self, filenames, regex_strings, compiled_regexes=None):
763        self.filenames = filenames
764        # get the full absolute, normalised paths
765        filenames = [os.path.abspath(f) for f in filenames]
766        self.path_regex_components = get_all_paths_components(
767            filenames, compiled_regexes)
768        self.display_regex_strings = parameter_list_as_string(regex_strings)
769
770    def __call__(self, pattern):
771        # swapped nesting order makes the syntax easier to explain:
772        #   The first level of indirection is always the path component
773        #   So basename[0] is the file name for the first file
774        #   This looks better than the normal 0[basename]
775
776        # some contortions because format decodes {0} as an offset into a list and not not a lookup into a dict...
777        substitutes_list, substitutes_dict = swap_nesting_order(
778            self.path_regex_components)
779
780        try:
781            return pattern.format(*substitutes_list, **substitutes_dict)
782        except (KeyError, IndexError):
783            raise_formatter_substitution_exception(sys.exc_info()[1], self.display_regex_strings,
784                                                   pattern, self.filenames,
785                                                   substitutes_list, substitutes_dict)
786
787
788# _________________________________________________________________________________________
789#
790#   t_nested_formatter_replace
791#
792# _________________________________________________________________________________________
793class t_nested_formatter_replace(object):
794    """
795    Like  t_formatter_replace but with one additional level of nesting
796    I.e. everything is a list comprehension!
797    For combinatorics @decorators
798    """
799
800    def __init__(self, filenames, regex_strings, compiled_regexes):
801        # make sure that we have the same level of nestedness for regular expressions and file names etc.
802        if len(filenames) != len(regex_strings) or len(filenames) != len(compiled_regexes):
803            raise Exception("Logic Error.")
804        self.filenames = filenames
805        # get the full absolute, normalised paths
806        filenames = [[os.path.abspath(f) for f in filegroups]
807                     for filegroups in filenames]
808        self.path_regex_components = [get_all_paths_components(
809            f, r) for (f, r) in zip(filenames, compiled_regexes)]
810        self.display_regex_strs = [
811            parameter_list_as_string(ss) for ss in regex_strings]
812
813    def __call__(self, pattern):
814        # swapped nesting order makes the syntax easier to explain:
815        #   The first level of indirection is always the path component
816        #   So basename[0] is the file name for the first file
817        #   This looks better than the normal 0[basename]
818
819        # some contortions because format decodes {0} as an offset into a list and not not a lookup into a dict...
820        substitutes_list, substitutes_dict = swap_doubly_nested_order(
821            self.path_regex_components)
822        try:
823            return pattern.format(*substitutes_list, **substitutes_dict)
824        except (KeyError, IndexError):
825            formatter_str = ", ".join("formatter(%s)" %
826                                      ss for ss in self.display_regex_strs)
827            raise_formatter_substitution_exception(sys.exc_info()[1], formatter_str, pattern, self.filenames,
828                                                   substitutes_list, substitutes_dict)
829
830# _________________________________________________________________________________________
831#
832#   t_nested_string_replace
833#
834# _________________________________________________________________________________________
835
836
837class t_nested_string_replace(object):
838    """
839    Replaces path with directory
840    """
841
842    def __init__(self, prev_str, new_str):
843        self.prev_str = prev_str
844        self.new_str = new_str
845
846    def __call__(self, p):
847        return p.replace(prev_str, new_str)
848
849
850# _________________________________________________________________________________________
851#
852#   regex_replace
853#
854# _________________________________________________________________________________________
855
856#
857#   Perform normal regular expression substitution
858#
859REGEX_SUBSTITUTE = 0
860#
861#   An extra peculiar mode to help suffix along:
862#   Suffix regular expression have an implicit capture for everything up to the specified
863#       suffix text
864
865#
866#   By default, replaces the suffix part by adding leading r"\1" to the substitution pattern
867#       If r"\1" is already specified in the pattern, then we presume you know what
868#       you are doing, and will let you get along with it
869#
870SUFFIX_SUBSTITUTE = 1
871
872#
873#   REGEX_SUBSTITUTE is used for suffix() matches in 'extra' arguments (additional to output)
874#   which are strings
875#
876#   Complete replacement happens. If you wish to retain the prefix text
877#       before the suffix, you can do so by adding r"\1"
878#
879
880
881def regex_replace(filename, regex_str, compiled_regex, substitution_patterns, regex_or_suffix=REGEX_SUBSTITUTE):
882    return apply_func_to_sequence(substitution_patterns, t_regex_replace(filename, regex_str, compiled_regex, regex_or_suffix))
883
884
885def formatter_replace(filenames, regex_str, compiled_regex, substitution_patterns):
886    return apply_func_to_sequence(substitution_patterns, t_formatter_replace(filenames, regex_str, compiled_regex))
887
888
889def nested_formatter_replace(filenames, regex_strings, compiled_regexes, substitution_patterns):
890    return apply_func_to_sequence(substitution_patterns, t_nested_formatter_replace(filenames, regex_strings, compiled_regexes))
891
892
893def nested_string_replace(prev_str, new_str, substitution_patterns):
894    return apply_func_to_sequence(substitution_patterns, t_nested_string_replace(prev_str, new_str))
895
896
897# _________________________________________________________________________________________
898
899#   non_str_sequence
900
901# _________________________________________________________________________________________
902def non_str_sequence(arg):
903    """
904    Whether arg is a sequence.
905    We treat strings / dicts however as a singleton not as a sequence
906
907    """
908    # will only dive into list and set, everything else is not regarded as a sequence
909    # loss of flexibility but more conservative
910    # if (isinstance(arg, (basestring, dict, multiprocessing.managers.DictProxy))):
911    if (not isinstance(arg, (list, tuple, set))):
912        return False
913    try:
914        test = iter(arg)
915        return True
916    except TypeError:
917        return False
918
919# _________________________________________________________________________________________
920
921#   get_strings_in_flattened_sequence_aux
922
923#       helper function for next function
924
925# _________________________________________________________________________________________
926
927
928def get_strings_in_flattened_sequence_aux(p, l=None):
929    """
930    Unravels arbitrarily nested sequence and returns lists of strings
931    """
932    if l is None:
933        l = []
934    if isinstance(p, path_str_type):
935        l.append(p)
936    elif non_str_sequence(p):
937        for pp in p:
938            get_strings_in_flattened_sequence_aux(pp, l)
939    return l
940
941
942# _________________________________________________________________________________________
943
944#   non_str_sequence
945
946# _________________________________________________________________________________________
947def get_strings_in_flattened_sequence(p):
948    """
949    Traverses nested sequence and for each element, returns first string encountered
950    """
951    if p is None:
952        return []
953
954    #
955    #  string is returned as list of single string
956    #
957    if isinstance(p, path_str_type):
958        return [p]
959
960    #
961    #  Get all strings flattened into list
962    #
963    return get_strings_in_flattened_sequence_aux(p)
964
965
966# _________________________________________________________________________________________
967
968#   get_first_string_in_nested_sequence
969
970# _________________________________________________________________________________________
971def get_first_string_in_nested_sequence(p):
972    strings = get_strings_in_flattened_sequence(p)
973    if len(strings):
974        return strings[0]
975    return None
976
977
978#
979#   TODOOO third object could be a dict or a list
980#
981# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
982
983#   Encoders: turn objects and filenames into a more presentable format
984
985# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
986def ignore_unknown_encoder(obj):
987    if non_str_sequence(obj):
988        return "[%s]" % ", ".join(map(ignore_unknown_encoder, obj))
989    try:
990        s = str(obj)
991        if " object" in s and s[0] == '<' and s[-1] == '>':
992            pos = s.find(" object")
993            s = "<" + s[1:pos].replace("__main__.", "") + ">"
994        return s.replace('"', "'")
995    except:
996        return "<%s>" % str(obj.__class__).replace('"', "'")
997
998# _________________________________________________________________________________________
999#
1000#   shorten_filenames_encoder
1001# ________________________________________________________________________________________
1002
1003
1004def shorten_filenames_encoder(obj, n_levels=2):
1005    """
1006    Convert a set of parameters into a string
1007        Paths with > N levels of nested-ness are truncated
1008    """
1009
1010    #
1011    #   if < 0, nest by 2
1012    #
1013    if n_levels < 0:
1014        desired_len = -n_levels
1015        prev_encoded_len = 0
1016        #
1017        #   try more and more nestedness up to 9 if that fits inside desired length
1018        #       stop when increasing nestedness makes no difference
1019        #
1020        for nestedness in range(1, 20):
1021            res = shorten_filenames_encoder(obj, nestedness)
1022            if len(res) > desired_len or "..." not in res:
1023                break
1024            prev_encoded_len = len(res)
1025        desired_len = max(4, desired_len - 5)
1026        offset = len(res) - desired_len
1027        if offset < 0:
1028            return res
1029        return "<???> " + res[offset:]
1030
1031    #
1032    #   Recurse into lists and tuples
1033    #
1034    if non_str_sequence(obj):
1035        return "[%s]" % ", ".join(map(shorten_filenames_encoder, obj, [n_levels] * len(obj)))
1036
1037    #
1038    #   Only shorten strings
1039    #
1040    if not isinstance(obj, path_str_type):
1041        return ignore_unknown_encoder(obj)
1042
1043    #
1044    #   level = 0 means return full absolute path
1045    #
1046    if not n_levels:
1047        return ignore_unknown_encoder(os.path.abspath(obj))
1048
1049    #
1050    # Shorten both relative and absolute (full) paths
1051    #
1052
1053    # if within bounds, return that
1054    if obj[1:].count('/') < n_levels:
1055        return ignore_unknown_encoder(obj)
1056
1057    # use relative path if that has <= nested level
1058    rel_path = os.path.relpath(obj)
1059    if rel_path.count('/') <= n_levels:
1060        #print >>sys.stderr, "relative path only one nested level"
1061        return ignore_unknown_encoder(rel_path)
1062
1063    # get last N nested levels
1064    # print >>sys.stderr, "full path last N nested level"
1065    return ignore_unknown_encoder(get_nth_nested_level_of_path(obj, n_levels))
1066
1067
1068# _________________________________________________________________________________________
1069#
1070#   get_tasks_filename_globs_in_nested_sequence
1071#
1072# ________________________________________________________________________________________
1073glob_letters = set('*[]?')
1074
1075
1076def is_glob(s):
1077    """Check whether 's' contains ANY of glob chars"""
1078    return len(glob_letters.intersection(s)) > 0
1079
1080
1081# _________________________________________________________________________________________
1082#
1083#   get_nested_tasks_or_globs
1084#
1085# ________________________________________________________________________________________
1086def get_nested_tasks_or_globs(p, treat_strings_as_tasks=False, runtime_data_names=None, tasks=None, globs=None):
1087    """
1088    Get any tasks or globs which are within parameter
1089        tasks are returned as functions or function names
1090    """
1091    #
1092    # create storage if this is not a recursive call
1093    #
1094    if globs is None:
1095        runtime_data_names, tasks, globs = set(), list(), set()
1096
1097    #
1098    #   task function
1099    #
1100    if isinstance(p, Callable):
1101        tasks.append(p)
1102    elif p.__class__.__name__ == 'Task' or p.__class__.__name__ == 'Pipeline':
1103        tasks.append(p)
1104    elif isinstance(p, runtime_parameter):
1105        runtime_data_names.add(p)
1106
1107    #
1108    #   output_from treats all arguments as tasks or task names
1109    #
1110    elif isinstance(p, output_from):
1111        for pp in p.args:
1112            get_nested_tasks_or_globs(
1113                pp, True, runtime_data_names, tasks, globs)
1114
1115    elif isinstance(p, path_str_type):
1116        if treat_strings_as_tasks:
1117            tasks.append(p)
1118        elif is_glob(p):
1119            globs.add(p)
1120
1121    elif non_str_sequence(p):
1122        for pp in p:
1123            get_nested_tasks_or_globs(
1124                pp, treat_strings_as_tasks, runtime_data_names, tasks, globs)
1125    return tasks, globs, runtime_data_names
1126
1127# _________________________________________________________________________________________
1128#
1129#   replace_placeholders_with_tasks_in_input_params
1130#
1131# ________________________________________________________________________________________
1132
1133
1134def replace_placeholders_with_tasks_in_input_params(p, func_or_name_to_task, treat_strings_as_tasks=False):
1135    """
1136    Replaces task functions or task name (strings) with the tasks they represent
1137    Also replaces Tasks and Pipelines with the correct Tasks
1138    func_or_name_to_task are a dictionary of function and task names to tasks
1139
1140    """
1141    if p.__class__.__name__ == 'Pipeline':
1142        return func_or_name_to_task["PIPELINE=%s=PIPELINE" % p.name]
1143
1144    if p.__class__.__name__ == 'Task' and p in func_or_name_to_task:
1145        return func_or_name_to_task[p]
1146
1147    #
1148    # Expand globs or tasks as a list only if they are top level
1149    #
1150    if isinstance(p, Callable):
1151        # if type(p) == types.FunctionType:
1152        return func_or_name_to_task[p]
1153
1154    #
1155    #   output_from treats all arguments as tasks or task names
1156    #
1157    if isinstance(p, output_from):
1158        if len(p.args) == 1:
1159            return replace_placeholders_with_tasks_in_input_params(p.args[0], func_or_name_to_task, True)
1160        else:
1161            return [replace_placeholders_with_tasks_in_input_params(pp, func_or_name_to_task, True) for pp in p.args]
1162
1163    #
1164    # strings become tasks if treat_strings_as_tasks
1165    #
1166    if isinstance(p, path_str_type):
1167        if treat_strings_as_tasks:
1168            return func_or_name_to_task[p]
1169        return p
1170
1171    #
1172    # No conversions within dictionaries
1173    #
1174    if isinstance(p, dict):
1175        return p
1176
1177    #
1178    # Other sequences are recursed down
1179    #
1180    elif non_str_sequence(p):
1181        l = list()
1182        for pp in p:
1183
1184            #
1185            #   To be intuitive:
1186            #   arguments wrapped by output_from are always treated "in-line"
1187            #           e.g. 1, output_from("a") => 1, task_a
1188            #           e.g. 1, output_from("a", 2) => 1, task_a, 2
1189            #
1190            if isinstance(pp, output_from):
1191                if len(pp.args) > 1:
1192                    l.extend(tuple(replace_placeholders_with_tasks_in_input_params(
1193                        pp, func_or_name_to_task, True)))
1194                elif len(pp.args) == 1:
1195                    l.append(replace_placeholders_with_tasks_in_input_params(
1196                        pp.args[0], func_or_name_to_task, True))
1197                # else len(pp.args) == 0 !! do nothing
1198
1199            else:
1200                l.append(replace_placeholders_with_tasks_in_input_params(
1201                    pp, func_or_name_to_task, treat_strings_as_tasks))
1202        return type(p)(l)
1203
1204    #
1205    # No expansions of non-string/non-sequences
1206    #
1207    else:
1208        return p
1209
1210# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
1211
1212#   compiling regular expressions
1213
1214# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
1215# _________________________________________________________________________________________
1216
1217#   suffix
1218
1219# _________________________________________________________________________________________
1220
1221
1222class suffix(object):
1223    def __init__(self, *args):
1224        self.args = args
1225
1226    def __repr__(self, *args):
1227        return 'suffix%r' % (self.args,)
1228
1229# _________________________________________________________________________________________
1230
1231#   regex
1232
1233# _________________________________________________________________________________________
1234
1235
1236class regex(object):
1237    def __init__(self, *args):
1238        self.args = args
1239
1240    def __repr__(self, *args):
1241        return 'regex%r' % (self.args,)
1242
1243# _________________________________________________________________________________________
1244
1245#   regex
1246
1247# _________________________________________________________________________________________
1248
1249
1250class formatter(object):
1251    def __init__(self, *args):
1252        self.args = args
1253
1254    def __repr__(self, *args):
1255        return 'formatter%r' % (self.args,)
1256
1257# _________________________________________________________________________________________
1258
1259#   wrap_exception_as_string
1260
1261# _________________________________________________________________________________________
1262
1263
1264def wrap_exception_as_string():
1265    """
1266    return exception as string to be rethrown
1267    """
1268    exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
1269    msg = "%s.%s" % (exceptionType.__module__, exceptionType.__name__)
1270    exception_value = str(exceptionValue)
1271    if len(exception_value):
1272        return msg + ": (%s)" % exception_value
1273    return msg
1274
1275
1276# _________________________________________________________________________________________
1277
1278#   compile_formatter
1279
1280# _________________________________________________________________________________________
1281def compile_formatter(enclosing_task, formatter_obj, error_object, descriptor_string):
1282    """
1283    Given list of [string|None]
1284    Return compiled regular expressions.
1285    """
1286
1287    compiled_regexes = []
1288    for ss in formatter_obj.args:
1289        # ignore None
1290        if ss is None:
1291            compiled_regexes.append(None)
1292            continue
1293
1294        formatter_args = str(formatter_obj.args)[1:-1]
1295        # regular expression should be strings
1296        if not isinstance(ss, path_str_type):
1297            raise error_object(enclosing_task, ("{descriptor_string}: "
1298                                                "formatter({formatter_args}) is malformed\n"
1299                                                "formatter(...) should only be used to wrap "
1300                                                'regular expression strings or None (not "{ss}")')
1301                               .format(descriptor_string=descriptor_string,
1302                                       formatter_args=formatter_args,
1303                                       ss=ss)
1304                               )
1305
1306        try:
1307            compiled_regexes.append(re.compile(ss))
1308        except:
1309            raise error_object(enclosing_task, ("{descriptor_string}: "
1310                                                "in formatter({formatter_args}) \n"
1311                                                'regular expression "{ss}" is malformed\n'
1312                                                "[{except_str}]")
1313                               .format(descriptor_string=descriptor_string,
1314                                       formatter_args=formatter_args,
1315                                       ss=ss,
1316                                       except_str=wrap_exception_as_string())
1317                               )
1318    return compiled_regexes
1319
1320
1321# _________________________________________________________________________________________
1322
1323#   compile_regex
1324
1325# _________________________________________________________________________________________
1326def compile_regex(enclosing_task, regex, error_object, descriptor_string, regex_object_name="regex"):
1327    """
1328    throw error unless regular expression compiles
1329    """
1330    if not len(regex.args) or len(regex.args) > 1 or not isinstance(regex.args[0], path_str_type):
1331        regex_str = str(regex.args)
1332        if len(regex.args) > 1:
1333            regex_str = regex_str[1:-1]
1334        elif len(regex.args) == 0:
1335            regex_str = ''
1336        else:
1337            regex_str = regex_str
1338        raise error_object(enclosing_task, ("{descriptor_string}: "
1339                                            "{regex_object_name}({regex_str}) is malformed\n"
1340                                            "{regex_object_name}(...) should only be used to wrap a single regular expression string")
1341                           .format(descriptor_string=descriptor_string,
1342                                   regex_str=regex_str,
1343                                   regex_object_name=regex_object_name)
1344                           )
1345    try:
1346        matching_regex = re.compile(regex.args[0])
1347        return matching_regex
1348    except:
1349        raise error_object(enclosing_task, ("{descriptor_string}: "
1350                                            "regular expression {regex_object_name}('{regex_str}') is malformed\n"
1351                                            "[{except_str}]")
1352                           .format(descriptor_string=descriptor_string,
1353                                   regex_object_name=regex_object_name,
1354                                   regex_str=regex.args[0],
1355                                   except_str=wrap_exception_as_string())
1356                           )
1357
1358# _________________________________________________________________________________________
1359
1360#   compile_suffix
1361
1362# _________________________________________________________________________________________
1363
1364
1365def compile_suffix(enclosing_task, regex, error_object, descriptor_string):
1366    """
1367    throw error unless regular expression compiles
1368    """
1369    if not len(regex.args):
1370        raise error_object(enclosing_task, "%s: " % descriptor_string +
1371                           "suffix() is malformed.\n" +
1372                           "suffix(...) should be used to wrap a string matching the suffices of file names")
1373    if len(regex.args) > 1 or not isinstance(regex.args[0], path_str_type):
1374        raise error_object(enclosing_task, "%s: " % descriptor_string +
1375                           "suffix('%s') is malformed.\n" % (regex.args,) +
1376                           "suffix(...) should only be used to wrap a single string matching the suffices of file names")
1377    try:
1378        matching_regex = re.compile("(.*)" + re.escape(regex.args[0]) + "$")
1379        return matching_regex
1380    except:
1381        raise error_object(enclosing_task, "%s: " % descriptor_string +
1382                           "suffix('%s') is somehow malformed\n" % regex.args[0] +
1383                           "[%s]" % wrap_exception_as_string())
1384
1385# _________________________________________________________________________________________
1386
1387#   check_parallel_parameters
1388
1389# _________________________________________________________________________________________
1390
1391
1392def check_parallel_parameters(enclosing_task, params, error_object):
1393    """
1394    Helper function for @files
1395    Checks format of parameters and
1396    whether there are input and output files specified for each job
1397    """
1398    if not len(params):
1399        raise Exception("@parallel parameters is empty.")
1400
1401    for job_param in params:
1402        if isinstance(job_param, path_str_type):
1403            message = ("Wrong syntax for @parallel.\n"
1404                       "@parallel(%s)\n" % ignore_unknown_encoder(params) +
1405                       "If you are supplying parameters for a task "
1406                       "running as a single job, "
1407                       "either don't put enclosing brackets at all (with each parameter "
1408                       "separated by commas) or enclose all parameters as a nested list of "
1409                       "lists, e.g. [['input', 'output' ...]]. "
1410                       )
1411            raise error_object(enclosing_task, message)
1412
1413
1414# _________________________________________________________________________________________
1415
1416#   check_files_io_parameters
1417
1418# _________________________________________________________________________________________
1419def check_files_io_parameters(enclosing_task, params, error_object):
1420    """
1421    Helper function for @files
1422    Checks format of parameters and
1423    whether there are input and output files specified for each job
1424    """
1425    # if not len(params):
1426    #    raise Exception("@files I/O parameters is empty.")
1427
1428    try:
1429        for job_param in params:
1430            if isinstance(job_param, path_str_type):
1431                raise TypeError
1432
1433            if len(job_param) < 1:
1434                raise error_object(enclosing_task, "Missing input files for job " +
1435                                   ignore_unknown_encoder(job_param))
1436            if len(job_param) < 2:
1437                raise error_object(enclosing_task, "Missing output files for job " +
1438                                   ignore_unknown_encoder(job_param))
1439            # if len(get_strings_in_flattened_sequence(job_param[0:2])) == 0:
1440            #    raise error_object(enclosing_task, "Input or output file parameters should "
1441            #                                        "contain at least one or more file names strings. "
1442            #                                        "Consider using @parallel if you are not using files. " +
1443            #                                        ignore_unknown_encoder(job_param))
1444    except TypeError:
1445        #
1446        # job_param was not a list
1447        #
1448        message = ("Wrong syntax for @files.\n@files(%s)\n" % ignore_unknown_encoder(params) +
1449                   "If you are supplying parameters for a task "
1450                   "running as a single job, "
1451                   "either don't put enclosing brackets at all (with each parameter "
1452                   "separated by commas) or enclose all parameters as a nested list of "
1453                   "lists, e.g. [['input', 'output' ...]]. "
1454                   )
1455        raise error_object(enclosing_task, message)
1456
1457# _________________________________________________________________________________________
1458#
1459#   expand_nested_tasks_or_globs
1460#
1461# ________________________________________________________________________________________
1462
1463
1464def expand_nested_tasks_or_globs(p, tasksglobs_to_filenames):
1465    """
1466    Expand globs and tasks "in-line", unless they are the top level, in which case turn
1467    it into a list
1468
1469    N.B. Globs are only expanded if they are in tasksglobs_to_filenames
1470         This function is called for @split descriptors which leave output globs untouched
1471         for clarity. Thanks to Noah Spies for spotting this!
1472    """
1473
1474    #
1475    # Expand globs or tasks as a list only if they are top level
1476    #
1477    if ((isinstance(p, path_str_type) and is_glob(p) and p in tasksglobs_to_filenames) or
1478            p.__class__.__name__ == 'Task' or
1479            isinstance(p, runtime_parameter)):
1480        return tasksglobs_to_filenames[p]
1481
1482    #
1483    # No expansions of strings and dictionaries
1484    #
1485    if isinstance(p, (path_str_type, dict)):
1486        return p
1487
1488    #
1489    # Other sequences are recursed down
1490    #
1491    elif non_str_sequence(p):
1492        l = list()
1493        for pp in p:
1494            if (isinstance(pp, path_str_type) and pp in tasksglobs_to_filenames):
1495                l.extend(tasksglobs_to_filenames[pp])
1496            elif pp.__class__.__name__ == 'Task' or isinstance(pp, runtime_parameter):
1497                files = tasksglobs_to_filenames[pp]
1498                # task may have produced a single output: in which case append
1499                if non_str_sequence(files):
1500                    l.extend(files)
1501                else:
1502                    l.append(files)
1503            else:
1504                l.append(expand_nested_tasks_or_globs(
1505                    pp, tasksglobs_to_filenames))
1506        return type(p)(l)
1507
1508    #
1509    # No expansions of non-string/non-sequences
1510    #
1511    else:
1512        return p
1513
1514
1515# _________________________________________________________________________________________
1516
1517#   get_parsed_arguments_str_for_errors
1518
1519#       helper funciton for parse_task_arguments()
1520
1521# _________________________________________________________________________________________
1522def get_parsed_arguments_str_for_errors(task_description, bad_arg_str, unnamed_result_strs, named_result_strs):
1523    """
1524    Helper function for parse_task_arguments
1525        Prints out offending argument (bad_arg_str) in the context of already parsed
1526        arguments so that we can quickly figure out where the error is coming from
1527    """
1528    indent = task_description.find("(") + 1
1529    parsed_arg_str = ", ".join(unnamed_result_strs + named_result_strs)
1530    # make function names clearer in arg list
1531    parsed_arg_str = re.sub(
1532        r"<function (\w+) at 0x[0-9a-f]+>", r"\1", parsed_arg_str)
1533    return "\n" + task_description % (parsed_arg_str + ", ...\n" +
1534                                      # mark out problem
1535                                      (" " * (indent-5 if indent - 5 > 0 else 0)) + "===> " +
1536                                      bad_arg_str)
1537
1538
1539# _________________________________________________________________________________________
1540
1541#   parse_task_arguments
1542
1543# _________________________________________________________________________________________
1544def parse_task_arguments(orig_unnamed_arguments, orig_named_arguments, expected_arguments, task_description):
1545    """
1546    Parse arguments parsed into decorators or Pipeline.transform etc.
1547        Special handling for optional arguments in the middle of argument list
1548            1) @product
1549                can have (input, filter, input1, filter1, input2, filter2....)
1550            2) @transform, @subdivide, @collate, @product, @combinatorics which have
1551                    (..., [add_inputs(...)|inputs(...)],...)
1552                    or ([add_inputs=...|replace_inputs=...])
1553                    or ([add_inputs=add_inputs(...)|replace_inputs=inputs(...)])
1554        Special handling for variable number of arguments at the end of the argument list
1555            which all become "extras"
1556
1557    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1558    #
1559    #   N.B. Missing non-mandatory arguments are returned as an empty list
1560    #
1561    #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1562
1563                """
1564    results = {}
1565    unnamed_arguments = list(orig_unnamed_arguments)
1566    named_arguments = dict(orig_named_arguments)
1567    # parsed results in string form for error messages
1568    unnamed_result_strs = []
1569    named_result_strs = []
1570
1571    def parse_add_inputs_args(parsed_arg, input_type, arg_name, modify_inputs_mode, result_strs):
1572        """
1573        Parse arguments for add_inputs and replace_inputs, i.e. 'inputs()' and 'add_inputs()'
1574            input_type =inputs|add_inputs
1575            arg_name = replace_inputs | add_inputs
1576            modify_inputs_mode = t_extra_inputs.REPLACE_INPUTS| t_extra_inputs.ADD_TO_INPUTS
1577        """
1578        results["modify_inputs_mode"] = modify_inputs_mode
1579        if input_type == inputs:
1580            # inputs() only takes a single argument. Throw error otherwise
1581            if len(parsed_arg.args) != 1:
1582                err_msg = "inputs() expects a single argument:\n%s" % (
1583                    get_parsed_arguments_str_for_errors(task_description,   # bad arg in context of parsed
1584                                                        "%s%r" % (input_type.__name__, tuple(
1585                                                            parsed_arg.args)),
1586                                                        unnamed_result_strs,
1587                                                        named_result_strs))
1588                raise error_inputs_multiple_args(err_msg)
1589            # unpack add_inputs / inputs and save results
1590            results["modify_inputs"] = parsed_arg.args[0]
1591        else:
1592            results["modify_inputs"] = parsed_arg.args
1593        result_strs.append("%s=%r" % (arg_name, parsed_arg.args))
1594
1595    def check_argument_type(arg_name, parsed_arg, argument_types):
1596        """
1597        check if parsed_arg is right type
1598        """
1599        if argument_types and not isinstance(parsed_arg, argument_types):
1600            err_msg = ("The '%s' argument should be %s:\n%s" %
1601                       (arg_name,                                                  # argument name
1602                        # type names
1603                        " or ".join("%s" %
1604                                    tt.__name__ for tt in argument_types),
1605                        get_parsed_arguments_str_for_errors(task_description,       # bad arg in context of parsed
1606                                                            "%s = %r" % (
1607                                                                arg_name, parsed_arg),
1608                                                            unnamed_result_strs, named_result_strs)))
1609            #print (err_msg, file=sys.stderr)
1610            raise TypeError(err_msg)
1611
1612        return parsed_arg
1613
1614    def parse_argument(arg_name, expected_arguments, unnamed_arguments, named_arguments,
1615                       results, task_description, mandatory, argument_types=None):
1616        """
1617        All missing, non-mandatory are empty list
1618        """
1619
1620        # ignore if not on list
1621        if not arg_name in expected_arguments:
1622            return
1623
1624        #
1625        # look among unnamed arguments first
1626        #
1627        if len(unnamed_arguments):
1628            # check correct type
1629            parsed_arg = check_argument_type(
1630                arg_name, unnamed_arguments[0], argument_types)
1631            # save parsed results
1632            results[arg_name] = parsed_arg
1633            unnamed_result_strs.append("%s=%r" % (arg_name, parsed_arg))
1634            del unnamed_arguments[0]
1635
1636        #
1637        # then named
1638        #
1639        elif arg_name in named_arguments:
1640            #
1641            # check correct type
1642            #
1643            parsed_arg = check_argument_type(
1644                arg_name, named_arguments[arg_name], argument_types)
1645            #
1646            #   Save parsed results
1647            #
1648            results[arg_name] = parsed_arg
1649            named_result_strs.append("%s=%r" % (arg_name, parsed_arg))
1650            del named_arguments[arg_name]
1651
1652        #
1653        # complain or ignore missing?
1654        #
1655        else:
1656            if mandatory:
1657                err_msg = "Missing '%s' argument:\n%s" % (
1658                    arg_name,                                               # argument name
1659                    get_parsed_arguments_str_for_errors(task_description,   # bad arg in
1660                                                        arg_name + " = ???",  # context of parsed
1661                                                        unnamed_result_strs,
1662                                                        named_result_strs))
1663                #print (err_msg, file=sys.stderr)
1664                raise error_missing_args(err_msg)
1665
1666            #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1667            #
1668            #   N.B. Missing non-mandatory arguments are returned as an empty list
1669            #
1670            #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
1671            else:
1672                results[arg_name] = []
1673
1674    #
1675    #   Missing input is empty list
1676    #
1677    parse_argument('input', expected_arguments, unnamed_arguments,
1678                   named_arguments, results, task_description, mandatory=False)
1679
1680    #
1681    #   filter is mandatory if expected
1682    #
1683    parse_argument('filter', expected_arguments, unnamed_arguments, named_arguments, results,
1684                   task_description, mandatory=True, argument_types=(formatter, regex, suffix))
1685
1686    if "filter" in results:
1687        if isinstance(results["filter"], suffix):
1688            parse_argument("output_dir", expected_arguments, [
1689            ], named_arguments, results, task_description, mandatory=False)
1690
1691    #
1692    #   inputN
1693    #
1694    if 'inputN' in expected_arguments:
1695        #
1696        # put already parsed input and filter into the list
1697        #
1698        results["input"] = [results["input"]]
1699        results["filter"] = [results["filter"]]
1700        cnt_inputN = 1
1701        #
1702        #   parse argument pairs at a time, so long as the second argument is a formatter
1703        #
1704        while len(unnamed_arguments) >= 2 and isinstance(unnamed_arguments[1], formatter):
1705            filter_name = "filter%d" % (cnt_inputN + 1)
1706            input_name = "input%d" % (cnt_inputN + 1)
1707            unnamed_result_strs.append("%s=%r" % (
1708                input_name, unnamed_arguments[0]))
1709            unnamed_result_strs.append("%s=%r" % (
1710                filter_name, unnamed_arguments[1]))
1711            results["input"].append(unnamed_arguments[0])
1712            results["filter"].append(unnamed_arguments[1])
1713            cnt_inputN += 1
1714            del unnamed_arguments[0:2]
1715
1716        #
1717        #   parse named arguments while there is a filter2, filter3 etc.
1718        #
1719        filter_name = "filter%d" % (cnt_inputN + 1)
1720        input_name = "input%d" % (cnt_inputN + 1)
1721        while filter_name in named_arguments:
1722            results["filter"].append(named_arguments[filter_name])
1723            named_result_strs.append("%s=%r" % (
1724                filter_name, named_arguments[filter_name]))
1725            del named_arguments[filter_name]
1726            #   parse input2, input3 or leave blank as empty list
1727            if input_name in named_arguments:
1728                results["input"].append(named_arguments[input_name])
1729                named_result_strs.append("%s=%r" % (
1730                    input_name, named_arguments[input_name]))
1731                del named_arguments[input_name]
1732            else:
1733                results["input"].append([])
1734            cnt_inputN += 1
1735            filter_name = "filter%d" % (cnt_inputN + 1)
1736            input_name = "input%d" % (cnt_inputN + 1)
1737
1738    #
1739    #   tuple size is int and mandatory if exists
1740    #
1741    parse_argument('tuple_size', expected_arguments, unnamed_arguments, named_arguments,
1742                   results, task_description, mandatory=True, argument_types=(int,))
1743
1744    #
1745    #   add_inputs / inputs are optional
1746    #
1747    if 'modify_inputs' in expected_arguments:
1748        results["modify_inputs_mode"] = t_extra_inputs.KEEP_INPUTS
1749        results["modify_inputs"] = None
1750        parse_add_inputs = ((inputs, "inputs", "replace_inputs", t_extra_inputs.REPLACE_INPUTS),
1751                            (add_inputs, "add_inputs", "add_inputs", t_extra_inputs.ADD_TO_INPUTS))
1752
1753        if len(unnamed_arguments):
1754            #
1755            #   Is add_inputs or inputs in unnamed arguments?
1756            #       Parse out contents and save in results["replace_inputs"] or results["add_inputs"]
1757            #
1758            for input_type, input_type_name, arg_name, modify_inputs_mode in parse_add_inputs:
1759                parsed_arg = unnamed_arguments[0]
1760                if isinstance(parsed_arg, input_type):
1761                    parse_add_inputs_args(
1762                        parsed_arg, input_type, arg_name, modify_inputs_mode, unnamed_result_strs)
1763                    del unnamed_arguments[0]
1764                    break
1765        #
1766        #   Otherwise is add_inputs or inputs in named arguments?
1767        #       Parse out contents only if necessary and save in results["replace_inputs"] or results["add_inputs"]
1768        #
1769        if results["modify_inputs_mode"] == t_extra_inputs.KEEP_INPUTS:
1770            for input_type, input_type_name, arg_name, modify_inputs_mode in parse_add_inputs:
1771                if arg_name in named_arguments:
1772                    parsed_arg = named_arguments[arg_name]
1773                    if isinstance(parsed_arg, input_type):
1774                        parse_add_inputs_args(
1775                            parsed_arg, input_type, arg_name, modify_inputs_mode, named_result_strs)
1776                    else:
1777                        results["modify_inputs"] = parsed_arg
1778                    results["modify_inputs_mode"] = modify_inputs_mode
1779                    del named_arguments[arg_name]
1780                    break
1781
1782    #
1783    #   output is mandatory if exists
1784    #
1785    parse_argument('output', expected_arguments, unnamed_arguments,
1786                   named_arguments, results, task_description, mandatory=True)
1787
1788    #
1789    #   extras is mandatory if exists
1790    #
1791    if 'extras' in expected_arguments:
1792        results['extras'] = []
1793        results['named_extras'] = {}
1794        if len(unnamed_arguments):
1795            # move list to results: remember python does shallow copies of lists
1796            results['extras'] = unnamed_arguments
1797            unnamed_result_strs.append("%s=%r" % ("extras", unnamed_arguments))
1798            unnamed_arguments = []
1799            #del unnamed_arguments[:]
1800        elif 'extras' in named_arguments:
1801            # Named extras only
1802            if isinstance(named_arguments['extras'], dict):
1803                results["named_extras"] = named_arguments['extras']
1804            # Unnamed extras only
1805            elif isinstance(named_arguments['extras'], list):
1806                results["extras"] = named_arguments['extras']
1807            # Wrong type: blow up
1808            else:
1809                err_msg = ("The extras paramter must be either a list of values\nor a dictionary of named parameter values:\n%s" %
1810                           get_parsed_arguments_str_for_errors(task_description,
1811                                                               "extras=%r" % (
1812                                                                   named_arguments['extras'],),
1813                                                               unnamed_result_strs,
1814                                                               named_result_strs))
1815                raise error_extras_wrong_type(err_msg)
1816
1817            named_result_strs.append("%s=%r" % (
1818                "extras", named_arguments['extras']))
1819            del named_arguments['extras']
1820
1821    if len(unnamed_arguments):
1822        err_msg = ("Too many unnamed arguments leftover:\n%s" %
1823                   get_parsed_arguments_str_for_errors(task_description,       # bad arg in context of parsed
1824                                                       (", ".join(("%r" % a)
1825                                                                  for a in unnamed_arguments)),
1826                                                       unnamed_result_strs, named_result_strs))
1827        #print (err_msg, file=sys.stderr)
1828        raise error_too_many_args(err_msg)
1829    if len(named_arguments):
1830        err_msg = ("Duplicate, conflicting or unrecognised arguments:\n%s" %
1831                   get_parsed_arguments_str_for_errors(task_description,       # bad arg in context of parsed
1832                                                       ", ".join("%s=%r" % (
1833                                                           k, v) for k, v in named_arguments.items()),
1834                                                       unnamed_result_strs, named_result_strs))
1835        #print (err_msg, file=sys.stderr)
1836        raise error_too_many_args(err_msg)
1837
1838    return results
1839
1840# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
1841
1842#   special markers used by @files_re
1843
1844# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
1845
1846
1847class combine(object):
1848    def __init__(self, *args):
1849        self.args = args
1850
1851
1852class output_from(object):
1853    def __init__(self, *args):
1854        self.args = args
1855
1856    def __repr__(self, *args):
1857        return 'output_from%r' % (self.args,)
1858
1859
1860class runtime_parameter(object):
1861    def __init__(self, *args):
1862        if len(args) != 1 or not isinstance(args[0], path_str_type):
1863            raise Exception(
1864                "runtime_parameter takes the name of the run time parameter as a single string")
1865        self.args = args
1866
1867
1868# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
1869
1870#   Unit Testing code in test/test_ruffus_utility.py
1871
1872
1873# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888
1874