1from __future__ import print_function 2import copy 3import os 4from itertools import chain 5from . import dbdict 6import operator 7import marshal 8import hashlib 9import multiprocessing.managers 10from collections import defaultdict 11from .ruffus_exceptions import * 12from functools import reduce 13import glob 14import types 15import sys 16import re 17if sys.hexversion < 0x03000000: 18 from future_builtins import zip 19################################################################################ 20# 21# ruffus_utility.py 22# 23# 24# Copyright (c) 10/9/2009 Leo Goodstadt 25# 26# Permission is hereby granted, free of charge, to any person obtaining a copy 27# of this software and associated documentation files (the "Software"), to deal 28# in the Software without restriction, including without limitation the rights 29# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 30# copies of the Software, and to permit persons to whom the Software is 31# furnished to do so, subject to the following conditions: 32# 33# The above copyright notice and this permission notice shall be included in 34# all copies or substantial portions of the Software. 35# 36# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 37# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 38# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 39# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 40# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 41# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 42# THE SOFTWARE. 43################################################################################# 44 45 46""" 47 48******************************************** 49:mod:`ruffus_utility` -- Overview 50******************************************** 51 52 53.. moduleauthor:: Leo Goodstadt <ruffus@llew.org.uk> 54 55 Common utility functions 56 57 58""" 59 60 61# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 62 63# imports 64 65 66# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 67#import task 68try: 69 from collections.abc import Callable 70except ImportError: 71 from collections import Callable 72try: 73 import cPickle as pickle 74except: 75 import pickle as pickle 76if sys.hexversion >= 0x03000000: 77 # everything is unicode in python3 78 path_str_type = str 79else: 80 path_str_type = basestring 81 82# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 83 84# Constants 85 86 87# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 88 89# 90# file to store history out to 91# 92RUFFUS_HISTORY_FILE = '.ruffus_history.sqlite' 93# If DEFAULT_RUFFUS_HISTORY_FILE is specified in the environment variables, use that instead 94if "DEFAULT_RUFFUS_HISTORY_FILE" in os.environ: 95 RUFFUS_HISTORY_FILE = os.environ["DEFAULT_RUFFUS_HISTORY_FILE"] 96 97 98# only rerun when the file timestamps are out of date (classic mode) 99CHECKSUM_FILE_TIMESTAMPS = 0 100# also rerun when the history shows a job as being out of date 101CHECKSUM_HISTORY_TIMESTAMPS = 1 102CHECKSUM_FUNCTIONS = 2 # also rerun when function body has changed 103# also rerun when function parameters or function body change 104CHECKSUM_FUNCTIONS_AND_PARAMS = 3 105 106CHECKSUM_REGENERATE = 2 # regenerate checksums 107 108 109# number of times to check if an input file exists 110FILE_CHECK_RETRY = 5 111# number of seconds to sleep before retrying a file check 112FILE_CHECK_SLEEP = 10 113# _________________________________________________________________________________________ 114 115# t_extra_inputs 116# namespaced enum 117 118# _________________________________________________________________________________________ 119 120 121class t_extra_inputs: 122 (ADD_TO_INPUTS, REPLACE_INPUTS, KEEP_INPUTS, KEEP_OUTPUTS) = list(range(4)) 123 124 125class inputs(object): 126 def __init__(self, *args): 127 self.args = args 128 129 def __repr__(self, *args): 130 return 'inputs%r' % (self.args,) 131 132 133class add_inputs(object): 134 def __init__(self, *args): 135 self.args = args 136 137 def __repr__(self, *args): 138 return 'add_inputs%r' % (self.args,) 139 140 141def get_default_checksum_level(): 142 """ 143 Use the checksum level from the environmental variable DEFAULT_RUFFUS_CHECKSUM_LEVEL 144 Otherwise default to CHECKSUM_HISTORY_TIMESTAMPS 145 """ 146 147 # 148 # environmental variable not set 149 # 150 if "DEFAULT_RUFFUS_CHECKSUM_LEVEL" not in os.environ: 151 return CHECKSUM_HISTORY_TIMESTAMPS 152 153 # 154 # lookup value from list of CHECKSUM_XXX constants 155 # 156 checksum_level = None 157 env_checksum_level = os.environ["DEFAULT_RUFFUS_CHECKSUM_LEVEL"] 158 if len(env_checksum_level) == 1 and env_checksum_level in "0123": 159 checksum_level = int(env_checksum_level) 160 else: 161 global_var = globals() 162 for key in global_var: 163 if key.startswith('CHECKSUM') and global_var[key] == env_checksum_level: 164 checksum_level = value 165 166 # 167 # check environmental variable is valid string 168 # 169 if checksum_level is None: 170 raise error_checksum_level(("The environmental value " 171 "DEFAULT_RUFFUS_CHECKSUM_LEVEL should be: [0-3 | " 172 "CHECKSUM_FILE_TIMESTAMPS | " 173 "CHECKSUM_HISTORY_TIMESTAMPS | " 174 "CHECKSUM_FUNCTIONS | " 175 "CHECKSUM_FUNCTIONS_AND_PARAMS] (rather than '%s') ") 176 % (env_checksum_level,)) 177 178 return checksum_level 179 180 181# _________________________________________________________________________________________ 182 183# open_job_history 184 185# _________________________________________________________________________________________ 186def get_default_history_file_name(): 187 history_file = RUFFUS_HISTORY_FILE 188 # 189 # try path expansion using the main script name 190 # 191 try: 192 import __main__ as main 193 path_parts = path_decomposition(os.path.abspath(main.__file__)) 194 history_file = history_file.format(**path_parts) 195 except Exception: 196 pass 197 return history_file 198 199 200def open_job_history(history_file): 201 """ 202 Given a history file name, opens the correspond sqllite db file and returns the handle 203 """ 204 if not history_file: 205 history_file = get_default_history_file_name() 206 207 return dbdict.open(history_file, picklevalues=True) 208 209 210# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 211 212# Functions 213 214 215# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 216 217class JobHistoryChecksum: 218 """Class to remember exactly how an output file was created and when.""" 219 220 def __str__(self): 221 from time import strftime, gmtime 222 if hasattr(self, "params"): 223 return str([self.outfile, 224 strftime("%d %b %Y %H:%M:%S", gmtime(self.mtime)), 225 self.params, 226 self.task_name 227 ]) 228 else: 229 return strftime("%d %b %Y %H:%M:%S", gmtime(self.mtime)) 230 231 def __init__(self, outfile, mtime, params, task): 232 # filename and modification time 233 self.outfile = outfile 234 self.mtime = mtime 235 236 # Uncomment next two lines to debug: 237 #self.params = params 238 #self.task_name = task._name 239 240 # checksum exact params used to generate this output file 241 self.chksum_params = hashlib.md5(pickle.dumps(params)).hexdigest() 242 # checksum the function bytecode as well as the function context 243 # Don't use func_code alone-- changing the line number of the function, 244 # what global variables are available, etc would all change the checksum 245 if sys.hexversion >= 0x03000000: 246 code = task.user_defined_work_func.__code__ 247 func_defaults = task.user_defined_work_func.__defaults__ 248 else: 249 code = task.user_defined_work_func.func_code 250 func_defaults = task.user_defined_work_func.func_defaults 251 func_code = marshal.dumps(code.co_code) 252 253 # 254 # pickle code very defensively, but hopefully without breaking Jake Biesinger's pipelines! 255 # 256 attributes_to_pickle = [func_defaults, 257 code.co_argcount, 258 code.co_consts, 259 code.co_names, 260 code.co_nlocals, 261 code.co_varnames] 262 263 pickle_results = [] 264 for aa in attributes_to_pickle: 265 # Can't cpickle nested functions: typically blows up with func_code.co_consts 266 try: 267 pickle_results.append(pickle.dumps(aa)) 268 continue 269 except: 270 pass 271 # Marshal seems to be less sensitive: try that 272 try: 273 pickle_results.append(marshal.dumps(aa)) 274 continue 275 except: 276 pass 277 # Just make a string out of the attribute 278 try: 279 pickle_results.append(str(aa)) 280 continue 281 except: 282 pass 283 # OK give up, do nothing: On your head it is 284 285 func_extras = reduce(operator.add, pickle_results) 286 self.chksum_func = hashlib.md5(func_code + func_extras).hexdigest() 287 288 289# _________________________________________________________________________________________ 290# 291# parameter_list_as_string 292# 293# _________________________________________________________________________________________ 294def parameter_list_as_string(parameters): 295 """ 296 Input list of parameters 297 Turn this into a string for display 298 299 E.g. 300 301 """ 302 if parameters is None: 303 return "" 304 elif not isinstance(parameters, list): 305 raise Exception("Unexpected parameter list %s" % (parameters,)) 306 else: 307 return str(parameters)[1:-1] 308 309# _________________________________________________________________________________________ 310# 311# path_decomposition 312# 313# _________________________________________________________________________________________ 314 315 316def path_decomposition(orig_path): 317 """ 318 returns a dictionary identifying the components of a file path: 319 This has the following keys 320 basename: (any) base (file) name of the path not including the extension. No slash included 321 ext: (any) extension of the path including the "." 322 path: a list of subpaths created by removing subdirectory names 323 subdir: a list of subdirectory names from the most nested to the root 324 For example 325 apath = "/a/b/c/d/filename.txt" 326 { 'basename': 'filename', 327 'ext': '.txt' 328 'path': ['/a/b/c/d', '/a/b/c', '/a/b', '/a', '/'], , 329 'subdir': ['d', 'c', 'b', 'a', '/'] 330 } 331 "{path[2]}/changed/{subdir[0]}".format(**res) = '/a/b/changed/d' 332 "{path[3]}/changed/{subdir[1]}".format(**res) = '/a/changed/c' 333 """ 334 def recursive_split(a_path): 335 """ 336 split the path into its subdirectories recursively 337 """ 338 if not len(a_path): 339 return [[], []] 340 if a_path == "/" or a_path == "//": 341 return [[a_path], [a_path]] 342 sub_path_part, sub_dir_part = os.path.split(a_path) 343 if sub_dir_part: 344 sub_path_parts, sub_dir_parts = recursive_split(sub_path_part) 345 return [[a_path] + sub_path_parts, 346 [sub_dir_part] + sub_dir_parts] 347 else: 348 return [[], ["/"]] 349 # 350 if not len(orig_path): 351 return {'path': [], 'basename': '', 'ext': '', 'subdir': []} 352 353 # stop normpath from being too clever and removing initial ./ and terminal slash, turning paths into filenames 354 if orig_path in ["./", "/."]: 355 a_path = orig_path 356 else: 357 a_path = os.path.normpath(orig_path) 358 if orig_path[0:2] == "./" and a_path[0:2] != "./": 359 a_path = "./" + a_path 360 361 if orig_path[-1] == "/" and a_path[-1:] != "/": 362 a_path += "/" 363 364 path_part, file_part = os.path.split(a_path) 365 file_part, ext_part = os.path.splitext(file_part) 366 subpaths, subdirs = recursive_split(path_part) 367 return {'basename': file_part, 368 'ext': ext_part, 369 'subpath': subpaths, 370 'subdir': subdirs, 371 'path': path_part} 372 373 374# _________________________________________________________________________________________ 375# 376# get_nth_nested_level_of_path 377# 378# _________________________________________________________________________________________ 379def get_nth_nested_level_of_path(orig_path, n_levels): 380 """ 381 Return path with up to N levels of subdirectories 382 0 = full path 383 N = 1 : basename 384 N = 2 : basename + one subdirectory 385 386 For example 387 0 /test/this/now/or/not.txt 388 1 not.txt 389 2 or/not.txt 390 3 now/or/not.txt 391 4 this/now/or/not.txt 392 5 test/this/now/or/not.txt 393 6 /test/this/now/or/not.txt 394 7 /test/this/now/or/not.txt 395 """ 396 # FIXME: consider returning full path to make debugging easier or at least 397 # make it optional 398 if not n_levels or n_levels < 0: 399 return orig_path 400 res = path_decomposition(orig_path) 401 basename = res["basename"] + res["ext"] 402 shortened_path = os.path.join( 403 *(list(reversed(res["subdir"][0:(n_levels - 1)]))+[basename])) 404 if len(shortened_path) < len(orig_path): 405 return ".../" + shortened_path 406 407 408# _________________________________________________________________________________________ 409# 410# swap_nesting_order 411# 412# _________________________________________________________________________________________ 413def swap_nesting_order(orig_coll): 414 """ 415 Reverse nested order so that coll[3]['a'] becomes coll['a'][3] 416 """ 417 new_dict = defaultdict(dict) 418 new_list = [] 419 for ii, ii_item in enumerate(orig_coll): 420 for jj, value in ii_item.items(): 421 if isinstance(jj, int): 422 # resize 423 new_list += [{}]*(jj + 1 - len(new_list)) 424 new_list[jj][ii] = value 425 else: 426 new_dict[jj][ii] = value 427 return new_list, dict(new_dict) 428 429# _________________________________________________________________________________________ 430# 431# swap_doubly_nested_order 432# 433# _________________________________________________________________________________________ 434 435 436def swap_doubly_nested_order(orig_coll): 437 """ 438 Reverse nested order so that coll[3]['a'] becomes coll['a'][3] 439 """ 440 new_dict = dict() 441 new_list = [] 442 for ii, ii_item in enumerate(orig_coll): 443 for jj, jj_item in enumerate(ii_item): 444 for kk, value in jj_item.items(): 445 if isinstance(kk, int): 446 # resize 447 new_list += [{}]*(kk + 1 - len(new_list)) 448 if ii not in new_list[kk]: 449 new_list[kk][ii] = dict() 450 new_list[kk][ii][jj] = value 451 else: 452 if kk not in new_dict: 453 new_dict[kk] = dict() 454 if ii not in new_dict[kk]: 455 new_dict[kk][ii] = dict() 456 new_dict[kk][ii][jj] = value 457 458 return new_list, new_dict 459 460 461# _________________________________________________________________________________________ 462# 463# regex_matches_as_dict 464# 465# _________________________________________________________________________________________ 466def regex_matches_as_dict(test_str, compiled_regex): 467 """ 468 Returns result of regular expression match in a dictionary 469 combining both named and unnamed captures 470 """ 471 if compiled_regex: 472 if isinstance(compiled_regex, path_str_type): 473 compiled_regex = re.compile(compiled_regex) 474 mm = compiled_regex.search(test_str) 475 # Match failed 476 if mm is None: 477 return False 478 else: 479 # No capture 480 if mm.lastindex is None: 481 return {0: mm.group(0)} 482 # Combined named and unnamed captures 483 else: 484 # no dictionary comprehensions in python 2.6 :-( 485 #matchdicts.append({i : mm.group(i) for i in (range(mm.lastindex) + mm.groupdict().keys())}) 486 # Keys for captures: 487 # 1) unnamed captures = range(mm.lastindex + 1) 488 # 2) named captures = mm.groupdict().keys() 489 return dict((i, mm.group(i)) for i in (chain(iter(range(mm.lastindex + 1)), 490 iter(mm.groupdict().keys())))) 491 492 else: 493 return None 494 495 496# _________________________________________________________________________________________ 497# 498# path_decomposition_regex_match 499# 500# _________________________________________________________________________________________ 501def path_decomposition_regex_match(test_str, compiled_regex): 502 """ 503 Returns a dictionary identifying the components of a file path. 504 505 This includes both the components of a path: 506 basename: (any) base (file) name of the path not including the extension. No slash included 507 ext: (any) extension of the path including the "." 508 path: a list of subpaths created by removing subdirectory names 509 subdir: a list of subdirectory names from the most nested to the root 510 and regular expression matches 511 The keys are the index or name of the capturing group. 512 513 514 If compiled_regexes is not specified, return path decomposition only 515 516 If compiled_regexes is specified, and the corresponding regular expression does not match, 517 the entire match fails 518 519 For example 520 521 path_decomposition_regex_match("/a/b/c/sample1.bam", r"(.*)(?P<id>\d+)\..+") 522 523 { 524 0: '/a/b/c/sample1.bam', // captured by index 525 1: '/a/b/c/sample', // captured by index 526 'id': '1' // captured by name 527 'ext': '.bam', 528 'subdir': ['c', 'b', 'a', '/'], 529 'subpath': ['/a/b/c', '/a/b', '/a', '/'], 530 'path': '/a/b/c', 531 'basename': 'sample1', 532 }, 533 534 path_decomposition_regex_match("dbsnp15.vcf", r"(.*)(?P<id>\d+)\..+") 535 { 536 0: 'dbsnp15.vcf', // captured by index 537 1: 'dbsnp1', // captured by index 538 'id': '5' // captured by name 539 'ext': '.vcf', 540 'subdir': [], 541 'path': [], 542 'basename': 'dbsnp15', 543 }, 544 545 546 // fail 547 path_decomposition_regex_match("/test.txt", r"(.*)(?P<id>\d+)\..+") 548 {} 549 550 // path components only 551 path_decomposition_regex_match("/test.txt", None) 552 { 553 'ext': '.txt', 554 'subdir': ['/'] 555 'subpath': ['/'], 556 'path': '/', 557 'basename': 'test', 558 } 559 560 """ 561 pp = path_decomposition(test_str) 562 563 # regular expression not specified 564 # just path 565 if compiled_regex is None: 566 return pp 567 568 rr = regex_matches_as_dict(test_str, compiled_regex) 569 570 # regular expression match failed 571 # nothing 572 if rr == False: 573 return {} 574 575 # 576 # regular expression matches override file decomposition values in 577 # case of clashes between predefined keys such as "basename" and 578 # regular expression named capture groups 579 # 580 pp.update(rr) 581 return pp 582 583 584# _________________________________________________________________________________________ 585# 586# check_compiled_regexes 587# 588# _________________________________________________________________________________________ 589def check_compiled_regexes(compiled_regexes, expected_num): 590 """ 591 check compiled_regexes are of the right type and number 592 """ 593 if compiled_regexes is None: 594 return [None] * expected_num 595 596 if not isinstance(compiled_regexes, list): 597 raise Exception("Expecting list of None and strings") 598 599 # pad compiled_regexes with None 600 if len(compiled_regexes) < expected_num: 601 compiled_regexes.extend( 602 [None] * (expected_num - len(compiled_regexes))) 603 604 # Turn strings to regular expression just in case 605 # We don't want to do this here because the error messages are not very nice: 606 # There is not much context left 607 compiled_regexes = [re.compile(rr) if isinstance( 608 rr, path_str_type) else rr for rr in compiled_regexes] 609 610 # check types 611 regex_types = type(re.compile("")), type(None) 612 for rr in compiled_regexes: 613 if not isinstance(rr, regex_types): 614 raise Exception( 615 "Unexpected type %s ('%s') specified in regular expression list. Expecting string or compiled regular expression" % (type(rr), rr)) 616 617 return compiled_regexes 618 619 620# _________________________________________________________________________________________ 621# 622# get_all_paths_components 623# 624# _________________________________________________________________________________________ 625def get_all_paths_components(paths, compiled_regexes): 626 """ 627 For each path in a list, 628 If any of the regular expression matches fails, the whole list fails 629 """ 630 # 631 # merge regular expression matches and path decomposition 632 # 633 compiled_regexes = check_compiled_regexes(compiled_regexes, len(paths)) 634 results = [] 635 for (pp, rr) in zip(paths, compiled_regexes): 636 result = path_decomposition_regex_match(pp, rr) 637 if result == {}: 638 return [{}] * len(paths) 639 results.append(result) 640 return results 641 642 643# _________________________________________________________________________________________ 644# 645# apply_func_to_sequence 646# 647# _________________________________________________________________________________________ 648def apply_func_to_sequence(seq, func, tuple_of_conforming_types=(path_str_type,), tuple_of_sequences_types=(list, tuple, set)): 649 """ 650 Recurses into list/tuple/set sequences to apply func to conforming types 651 Non-conforming types are left alone 652 """ 653 if isinstance(seq, tuple_of_conforming_types): 654 return func(seq) 655 elif isinstance(seq, tuple_of_sequences_types): 656 return type(seq)(apply_func_to_sequence(pp, func, tuple_of_conforming_types, tuple_of_sequences_types) for pp in seq) 657 else: 658 return seq 659 660 661# _________________________________________________________________________________________ 662# 663# t_regex_replace 664# 665# _________________________________________________________________________________________ 666class t_regex_replace(object): 667 def __init__(self, filename, regex_str, compiled_regex, regex_or_suffix): 668 self.regex_or_suffix = regex_or_suffix 669 self.compiled_regex = compiled_regex 670 self.regex_str = regex_str 671 self.filename = filename 672 673 def __call__(self, p): 674 # 675 # check if substitution pattern is mis-specified 676 # 677 if "\1"in p or "\2" in p: 678 raise error_unescaped_regular_expression_forms("['%s'] " % (p.replace("\1", r"\1").replace("\2", r"\2")) + 679 "The special regular expression characters " 680 r"\1 and \2 need to be 'escaped' in python. " 681 r"The easiest option is to use python 'raw' strings " 682 r"e.g. r'\1_in_a string\2'. See http://docs.python.org/library/re.html.") 683 # 684 # For suffix(), replaces the suffix part by adding leading r"\1" to the substitution pattern 685 # 686 # If r"\1" is specified, then we presume you know what you are doing... 687 # 688 if self.regex_or_suffix == SUFFIX_SUBSTITUTE: 689 if r"\1" not in p and r"\g<1>" not in p: 690 match_p = r"\g<1>" + p 691 else: 692 match_p = p 693 694 # throw exception if doesn't match regular expression at all 695 (res_str, cnt_replacements) = self.compiled_regex.subn( 696 match_p, self.filename) 697 if cnt_replacements == 0: 698 raise error_input_file_does_not_match( 699 "File '%s' does not match suffix('%s') and pattern '%s'" % (self.filename, self.regex_str, p)) 700 return res_str 701 702 # 703 # Normal substitution 704 # 705 # For suffix(), complete replacement by the specified pattern text 706 # only substitute if r"\1" or r"\g<1>" is specified 707 # 708 # 709 err_str = "" 710 try: 711 (res_str, cnt_replacements) = self.compiled_regex.subn(p, self.filename) 712 if cnt_replacements > 0: 713 return res_str 714 except re.error: 715 exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() 716 err_str = str(exceptionValue) 717 raise fatal_error_input_file_does_not_match( 718 "File '%s' does not match regex('%s') and pattern '%s':\n\t%s\n" % (self.filename, self.regex_str, p, err_str)) 719 except IndexError: 720 exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() 721 err_str = str(exceptionValue) 722 raise fatal_error_input_file_does_not_match( 723 "File '%s' does not match regex('%s') and pattern '%s':\n\t%s\n" % (self.filename, self.regex_str, p, err_str)) 724 725 # except (re.error, IndexError): 726 #err_str = str(sys.exc_info()[1]), 727 728 raise error_input_file_does_not_match("File '%s' does not match regex('%s') and pattern '%s'\n%s\n" % ( 729 self.filename, self.regex_str, p, err_str)) 730 731 732# _________________________________________________________________________________________ 733# 734# raise_formatter_substitution_exception 735# 736# _________________________________________________________________________________________ 737def raise_formatter_substitution_exception(exceptionValue, formatter_str, pattern, filenames, 738 substitutes_list, substitutes_dict): 739 """ 740 Throws an exception when formatter fails to make a substitution 741 """ 742 # convert to string to get just the missing key 743 missing_key = str(exceptionValue.args[0]) 744 # strip quotes 745 if missing_key[0:1] in '\'"' and missing_key[-1:] in '\'"': 746 missing_key = missing_key[1:-1] 747 raise error_input_file_does_not_match("Unmatched field {%s} in '%s' where\n input = %r,\n" 748 " filter = formatter(%s). Possible substitutions= %s, %s." 749 % (missing_key, 750 pattern, 751 filenames, 752 formatter_str, 753 substitutes_list, substitutes_dict)) 754 755 756# _________________________________________________________________________________________ 757# 758# t_formatter_replace 759# 760# _________________________________________________________________________________________ 761class t_formatter_replace(object): 762 def __init__(self, filenames, regex_strings, compiled_regexes=None): 763 self.filenames = filenames 764 # get the full absolute, normalised paths 765 filenames = [os.path.abspath(f) for f in filenames] 766 self.path_regex_components = get_all_paths_components( 767 filenames, compiled_regexes) 768 self.display_regex_strings = parameter_list_as_string(regex_strings) 769 770 def __call__(self, pattern): 771 # swapped nesting order makes the syntax easier to explain: 772 # The first level of indirection is always the path component 773 # So basename[0] is the file name for the first file 774 # This looks better than the normal 0[basename] 775 776 # some contortions because format decodes {0} as an offset into a list and not not a lookup into a dict... 777 substitutes_list, substitutes_dict = swap_nesting_order( 778 self.path_regex_components) 779 780 try: 781 return pattern.format(*substitutes_list, **substitutes_dict) 782 except (KeyError, IndexError): 783 raise_formatter_substitution_exception(sys.exc_info()[1], self.display_regex_strings, 784 pattern, self.filenames, 785 substitutes_list, substitutes_dict) 786 787 788# _________________________________________________________________________________________ 789# 790# t_nested_formatter_replace 791# 792# _________________________________________________________________________________________ 793class t_nested_formatter_replace(object): 794 """ 795 Like t_formatter_replace but with one additional level of nesting 796 I.e. everything is a list comprehension! 797 For combinatorics @decorators 798 """ 799 800 def __init__(self, filenames, regex_strings, compiled_regexes): 801 # make sure that we have the same level of nestedness for regular expressions and file names etc. 802 if len(filenames) != len(regex_strings) or len(filenames) != len(compiled_regexes): 803 raise Exception("Logic Error.") 804 self.filenames = filenames 805 # get the full absolute, normalised paths 806 filenames = [[os.path.abspath(f) for f in filegroups] 807 for filegroups in filenames] 808 self.path_regex_components = [get_all_paths_components( 809 f, r) for (f, r) in zip(filenames, compiled_regexes)] 810 self.display_regex_strs = [ 811 parameter_list_as_string(ss) for ss in regex_strings] 812 813 def __call__(self, pattern): 814 # swapped nesting order makes the syntax easier to explain: 815 # The first level of indirection is always the path component 816 # So basename[0] is the file name for the first file 817 # This looks better than the normal 0[basename] 818 819 # some contortions because format decodes {0} as an offset into a list and not not a lookup into a dict... 820 substitutes_list, substitutes_dict = swap_doubly_nested_order( 821 self.path_regex_components) 822 try: 823 return pattern.format(*substitutes_list, **substitutes_dict) 824 except (KeyError, IndexError): 825 formatter_str = ", ".join("formatter(%s)" % 826 ss for ss in self.display_regex_strs) 827 raise_formatter_substitution_exception(sys.exc_info()[1], formatter_str, pattern, self.filenames, 828 substitutes_list, substitutes_dict) 829 830# _________________________________________________________________________________________ 831# 832# t_nested_string_replace 833# 834# _________________________________________________________________________________________ 835 836 837class t_nested_string_replace(object): 838 """ 839 Replaces path with directory 840 """ 841 842 def __init__(self, prev_str, new_str): 843 self.prev_str = prev_str 844 self.new_str = new_str 845 846 def __call__(self, p): 847 return p.replace(prev_str, new_str) 848 849 850# _________________________________________________________________________________________ 851# 852# regex_replace 853# 854# _________________________________________________________________________________________ 855 856# 857# Perform normal regular expression substitution 858# 859REGEX_SUBSTITUTE = 0 860# 861# An extra peculiar mode to help suffix along: 862# Suffix regular expression have an implicit capture for everything up to the specified 863# suffix text 864 865# 866# By default, replaces the suffix part by adding leading r"\1" to the substitution pattern 867# If r"\1" is already specified in the pattern, then we presume you know what 868# you are doing, and will let you get along with it 869# 870SUFFIX_SUBSTITUTE = 1 871 872# 873# REGEX_SUBSTITUTE is used for suffix() matches in 'extra' arguments (additional to output) 874# which are strings 875# 876# Complete replacement happens. If you wish to retain the prefix text 877# before the suffix, you can do so by adding r"\1" 878# 879 880 881def regex_replace(filename, regex_str, compiled_regex, substitution_patterns, regex_or_suffix=REGEX_SUBSTITUTE): 882 return apply_func_to_sequence(substitution_patterns, t_regex_replace(filename, regex_str, compiled_regex, regex_or_suffix)) 883 884 885def formatter_replace(filenames, regex_str, compiled_regex, substitution_patterns): 886 return apply_func_to_sequence(substitution_patterns, t_formatter_replace(filenames, regex_str, compiled_regex)) 887 888 889def nested_formatter_replace(filenames, regex_strings, compiled_regexes, substitution_patterns): 890 return apply_func_to_sequence(substitution_patterns, t_nested_formatter_replace(filenames, regex_strings, compiled_regexes)) 891 892 893def nested_string_replace(prev_str, new_str, substitution_patterns): 894 return apply_func_to_sequence(substitution_patterns, t_nested_string_replace(prev_str, new_str)) 895 896 897# _________________________________________________________________________________________ 898 899# non_str_sequence 900 901# _________________________________________________________________________________________ 902def non_str_sequence(arg): 903 """ 904 Whether arg is a sequence. 905 We treat strings / dicts however as a singleton not as a sequence 906 907 """ 908 # will only dive into list and set, everything else is not regarded as a sequence 909 # loss of flexibility but more conservative 910 # if (isinstance(arg, (basestring, dict, multiprocessing.managers.DictProxy))): 911 if (not isinstance(arg, (list, tuple, set))): 912 return False 913 try: 914 test = iter(arg) 915 return True 916 except TypeError: 917 return False 918 919# _________________________________________________________________________________________ 920 921# get_strings_in_flattened_sequence_aux 922 923# helper function for next function 924 925# _________________________________________________________________________________________ 926 927 928def get_strings_in_flattened_sequence_aux(p, l=None): 929 """ 930 Unravels arbitrarily nested sequence and returns lists of strings 931 """ 932 if l is None: 933 l = [] 934 if isinstance(p, path_str_type): 935 l.append(p) 936 elif non_str_sequence(p): 937 for pp in p: 938 get_strings_in_flattened_sequence_aux(pp, l) 939 return l 940 941 942# _________________________________________________________________________________________ 943 944# non_str_sequence 945 946# _________________________________________________________________________________________ 947def get_strings_in_flattened_sequence(p): 948 """ 949 Traverses nested sequence and for each element, returns first string encountered 950 """ 951 if p is None: 952 return [] 953 954 # 955 # string is returned as list of single string 956 # 957 if isinstance(p, path_str_type): 958 return [p] 959 960 # 961 # Get all strings flattened into list 962 # 963 return get_strings_in_flattened_sequence_aux(p) 964 965 966# _________________________________________________________________________________________ 967 968# get_first_string_in_nested_sequence 969 970# _________________________________________________________________________________________ 971def get_first_string_in_nested_sequence(p): 972 strings = get_strings_in_flattened_sequence(p) 973 if len(strings): 974 return strings[0] 975 return None 976 977 978# 979# TODOOO third object could be a dict or a list 980# 981# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 982 983# Encoders: turn objects and filenames into a more presentable format 984 985# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 986def ignore_unknown_encoder(obj): 987 if non_str_sequence(obj): 988 return "[%s]" % ", ".join(map(ignore_unknown_encoder, obj)) 989 try: 990 s = str(obj) 991 if " object" in s and s[0] == '<' and s[-1] == '>': 992 pos = s.find(" object") 993 s = "<" + s[1:pos].replace("__main__.", "") + ">" 994 return s.replace('"', "'") 995 except: 996 return "<%s>" % str(obj.__class__).replace('"', "'") 997 998# _________________________________________________________________________________________ 999# 1000# shorten_filenames_encoder 1001# ________________________________________________________________________________________ 1002 1003 1004def shorten_filenames_encoder(obj, n_levels=2): 1005 """ 1006 Convert a set of parameters into a string 1007 Paths with > N levels of nested-ness are truncated 1008 """ 1009 1010 # 1011 # if < 0, nest by 2 1012 # 1013 if n_levels < 0: 1014 desired_len = -n_levels 1015 prev_encoded_len = 0 1016 # 1017 # try more and more nestedness up to 9 if that fits inside desired length 1018 # stop when increasing nestedness makes no difference 1019 # 1020 for nestedness in range(1, 20): 1021 res = shorten_filenames_encoder(obj, nestedness) 1022 if len(res) > desired_len or "..." not in res: 1023 break 1024 prev_encoded_len = len(res) 1025 desired_len = max(4, desired_len - 5) 1026 offset = len(res) - desired_len 1027 if offset < 0: 1028 return res 1029 return "<???> " + res[offset:] 1030 1031 # 1032 # Recurse into lists and tuples 1033 # 1034 if non_str_sequence(obj): 1035 return "[%s]" % ", ".join(map(shorten_filenames_encoder, obj, [n_levels] * len(obj))) 1036 1037 # 1038 # Only shorten strings 1039 # 1040 if not isinstance(obj, path_str_type): 1041 return ignore_unknown_encoder(obj) 1042 1043 # 1044 # level = 0 means return full absolute path 1045 # 1046 if not n_levels: 1047 return ignore_unknown_encoder(os.path.abspath(obj)) 1048 1049 # 1050 # Shorten both relative and absolute (full) paths 1051 # 1052 1053 # if within bounds, return that 1054 if obj[1:].count('/') < n_levels: 1055 return ignore_unknown_encoder(obj) 1056 1057 # use relative path if that has <= nested level 1058 rel_path = os.path.relpath(obj) 1059 if rel_path.count('/') <= n_levels: 1060 #print >>sys.stderr, "relative path only one nested level" 1061 return ignore_unknown_encoder(rel_path) 1062 1063 # get last N nested levels 1064 # print >>sys.stderr, "full path last N nested level" 1065 return ignore_unknown_encoder(get_nth_nested_level_of_path(obj, n_levels)) 1066 1067 1068# _________________________________________________________________________________________ 1069# 1070# get_tasks_filename_globs_in_nested_sequence 1071# 1072# ________________________________________________________________________________________ 1073glob_letters = set('*[]?') 1074 1075 1076def is_glob(s): 1077 """Check whether 's' contains ANY of glob chars""" 1078 return len(glob_letters.intersection(s)) > 0 1079 1080 1081# _________________________________________________________________________________________ 1082# 1083# get_nested_tasks_or_globs 1084# 1085# ________________________________________________________________________________________ 1086def get_nested_tasks_or_globs(p, treat_strings_as_tasks=False, runtime_data_names=None, tasks=None, globs=None): 1087 """ 1088 Get any tasks or globs which are within parameter 1089 tasks are returned as functions or function names 1090 """ 1091 # 1092 # create storage if this is not a recursive call 1093 # 1094 if globs is None: 1095 runtime_data_names, tasks, globs = set(), list(), set() 1096 1097 # 1098 # task function 1099 # 1100 if isinstance(p, Callable): 1101 tasks.append(p) 1102 elif p.__class__.__name__ == 'Task' or p.__class__.__name__ == 'Pipeline': 1103 tasks.append(p) 1104 elif isinstance(p, runtime_parameter): 1105 runtime_data_names.add(p) 1106 1107 # 1108 # output_from treats all arguments as tasks or task names 1109 # 1110 elif isinstance(p, output_from): 1111 for pp in p.args: 1112 get_nested_tasks_or_globs( 1113 pp, True, runtime_data_names, tasks, globs) 1114 1115 elif isinstance(p, path_str_type): 1116 if treat_strings_as_tasks: 1117 tasks.append(p) 1118 elif is_glob(p): 1119 globs.add(p) 1120 1121 elif non_str_sequence(p): 1122 for pp in p: 1123 get_nested_tasks_or_globs( 1124 pp, treat_strings_as_tasks, runtime_data_names, tasks, globs) 1125 return tasks, globs, runtime_data_names 1126 1127# _________________________________________________________________________________________ 1128# 1129# replace_placeholders_with_tasks_in_input_params 1130# 1131# ________________________________________________________________________________________ 1132 1133 1134def replace_placeholders_with_tasks_in_input_params(p, func_or_name_to_task, treat_strings_as_tasks=False): 1135 """ 1136 Replaces task functions or task name (strings) with the tasks they represent 1137 Also replaces Tasks and Pipelines with the correct Tasks 1138 func_or_name_to_task are a dictionary of function and task names to tasks 1139 1140 """ 1141 if p.__class__.__name__ == 'Pipeline': 1142 return func_or_name_to_task["PIPELINE=%s=PIPELINE" % p.name] 1143 1144 if p.__class__.__name__ == 'Task' and p in func_or_name_to_task: 1145 return func_or_name_to_task[p] 1146 1147 # 1148 # Expand globs or tasks as a list only if they are top level 1149 # 1150 if isinstance(p, Callable): 1151 # if type(p) == types.FunctionType: 1152 return func_or_name_to_task[p] 1153 1154 # 1155 # output_from treats all arguments as tasks or task names 1156 # 1157 if isinstance(p, output_from): 1158 if len(p.args) == 1: 1159 return replace_placeholders_with_tasks_in_input_params(p.args[0], func_or_name_to_task, True) 1160 else: 1161 return [replace_placeholders_with_tasks_in_input_params(pp, func_or_name_to_task, True) for pp in p.args] 1162 1163 # 1164 # strings become tasks if treat_strings_as_tasks 1165 # 1166 if isinstance(p, path_str_type): 1167 if treat_strings_as_tasks: 1168 return func_or_name_to_task[p] 1169 return p 1170 1171 # 1172 # No conversions within dictionaries 1173 # 1174 if isinstance(p, dict): 1175 return p 1176 1177 # 1178 # Other sequences are recursed down 1179 # 1180 elif non_str_sequence(p): 1181 l = list() 1182 for pp in p: 1183 1184 # 1185 # To be intuitive: 1186 # arguments wrapped by output_from are always treated "in-line" 1187 # e.g. 1, output_from("a") => 1, task_a 1188 # e.g. 1, output_from("a", 2) => 1, task_a, 2 1189 # 1190 if isinstance(pp, output_from): 1191 if len(pp.args) > 1: 1192 l.extend(tuple(replace_placeholders_with_tasks_in_input_params( 1193 pp, func_or_name_to_task, True))) 1194 elif len(pp.args) == 1: 1195 l.append(replace_placeholders_with_tasks_in_input_params( 1196 pp.args[0], func_or_name_to_task, True)) 1197 # else len(pp.args) == 0 !! do nothing 1198 1199 else: 1200 l.append(replace_placeholders_with_tasks_in_input_params( 1201 pp, func_or_name_to_task, treat_strings_as_tasks)) 1202 return type(p)(l) 1203 1204 # 1205 # No expansions of non-string/non-sequences 1206 # 1207 else: 1208 return p 1209 1210# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 1211 1212# compiling regular expressions 1213 1214# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 1215# _________________________________________________________________________________________ 1216 1217# suffix 1218 1219# _________________________________________________________________________________________ 1220 1221 1222class suffix(object): 1223 def __init__(self, *args): 1224 self.args = args 1225 1226 def __repr__(self, *args): 1227 return 'suffix%r' % (self.args,) 1228 1229# _________________________________________________________________________________________ 1230 1231# regex 1232 1233# _________________________________________________________________________________________ 1234 1235 1236class regex(object): 1237 def __init__(self, *args): 1238 self.args = args 1239 1240 def __repr__(self, *args): 1241 return 'regex%r' % (self.args,) 1242 1243# _________________________________________________________________________________________ 1244 1245# regex 1246 1247# _________________________________________________________________________________________ 1248 1249 1250class formatter(object): 1251 def __init__(self, *args): 1252 self.args = args 1253 1254 def __repr__(self, *args): 1255 return 'formatter%r' % (self.args,) 1256 1257# _________________________________________________________________________________________ 1258 1259# wrap_exception_as_string 1260 1261# _________________________________________________________________________________________ 1262 1263 1264def wrap_exception_as_string(): 1265 """ 1266 return exception as string to be rethrown 1267 """ 1268 exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() 1269 msg = "%s.%s" % (exceptionType.__module__, exceptionType.__name__) 1270 exception_value = str(exceptionValue) 1271 if len(exception_value): 1272 return msg + ": (%s)" % exception_value 1273 return msg 1274 1275 1276# _________________________________________________________________________________________ 1277 1278# compile_formatter 1279 1280# _________________________________________________________________________________________ 1281def compile_formatter(enclosing_task, formatter_obj, error_object, descriptor_string): 1282 """ 1283 Given list of [string|None] 1284 Return compiled regular expressions. 1285 """ 1286 1287 compiled_regexes = [] 1288 for ss in formatter_obj.args: 1289 # ignore None 1290 if ss is None: 1291 compiled_regexes.append(None) 1292 continue 1293 1294 formatter_args = str(formatter_obj.args)[1:-1] 1295 # regular expression should be strings 1296 if not isinstance(ss, path_str_type): 1297 raise error_object(enclosing_task, ("{descriptor_string}: " 1298 "formatter({formatter_args}) is malformed\n" 1299 "formatter(...) should only be used to wrap " 1300 'regular expression strings or None (not "{ss}")') 1301 .format(descriptor_string=descriptor_string, 1302 formatter_args=formatter_args, 1303 ss=ss) 1304 ) 1305 1306 try: 1307 compiled_regexes.append(re.compile(ss)) 1308 except: 1309 raise error_object(enclosing_task, ("{descriptor_string}: " 1310 "in formatter({formatter_args}) \n" 1311 'regular expression "{ss}" is malformed\n' 1312 "[{except_str}]") 1313 .format(descriptor_string=descriptor_string, 1314 formatter_args=formatter_args, 1315 ss=ss, 1316 except_str=wrap_exception_as_string()) 1317 ) 1318 return compiled_regexes 1319 1320 1321# _________________________________________________________________________________________ 1322 1323# compile_regex 1324 1325# _________________________________________________________________________________________ 1326def compile_regex(enclosing_task, regex, error_object, descriptor_string, regex_object_name="regex"): 1327 """ 1328 throw error unless regular expression compiles 1329 """ 1330 if not len(regex.args) or len(regex.args) > 1 or not isinstance(regex.args[0], path_str_type): 1331 regex_str = str(regex.args) 1332 if len(regex.args) > 1: 1333 regex_str = regex_str[1:-1] 1334 elif len(regex.args) == 0: 1335 regex_str = '' 1336 else: 1337 regex_str = regex_str 1338 raise error_object(enclosing_task, ("{descriptor_string}: " 1339 "{regex_object_name}({regex_str}) is malformed\n" 1340 "{regex_object_name}(...) should only be used to wrap a single regular expression string") 1341 .format(descriptor_string=descriptor_string, 1342 regex_str=regex_str, 1343 regex_object_name=regex_object_name) 1344 ) 1345 try: 1346 matching_regex = re.compile(regex.args[0]) 1347 return matching_regex 1348 except: 1349 raise error_object(enclosing_task, ("{descriptor_string}: " 1350 "regular expression {regex_object_name}('{regex_str}') is malformed\n" 1351 "[{except_str}]") 1352 .format(descriptor_string=descriptor_string, 1353 regex_object_name=regex_object_name, 1354 regex_str=regex.args[0], 1355 except_str=wrap_exception_as_string()) 1356 ) 1357 1358# _________________________________________________________________________________________ 1359 1360# compile_suffix 1361 1362# _________________________________________________________________________________________ 1363 1364 1365def compile_suffix(enclosing_task, regex, error_object, descriptor_string): 1366 """ 1367 throw error unless regular expression compiles 1368 """ 1369 if not len(regex.args): 1370 raise error_object(enclosing_task, "%s: " % descriptor_string + 1371 "suffix() is malformed.\n" + 1372 "suffix(...) should be used to wrap a string matching the suffices of file names") 1373 if len(regex.args) > 1 or not isinstance(regex.args[0], path_str_type): 1374 raise error_object(enclosing_task, "%s: " % descriptor_string + 1375 "suffix('%s') is malformed.\n" % (regex.args,) + 1376 "suffix(...) should only be used to wrap a single string matching the suffices of file names") 1377 try: 1378 matching_regex = re.compile("(.*)" + re.escape(regex.args[0]) + "$") 1379 return matching_regex 1380 except: 1381 raise error_object(enclosing_task, "%s: " % descriptor_string + 1382 "suffix('%s') is somehow malformed\n" % regex.args[0] + 1383 "[%s]" % wrap_exception_as_string()) 1384 1385# _________________________________________________________________________________________ 1386 1387# check_parallel_parameters 1388 1389# _________________________________________________________________________________________ 1390 1391 1392def check_parallel_parameters(enclosing_task, params, error_object): 1393 """ 1394 Helper function for @files 1395 Checks format of parameters and 1396 whether there are input and output files specified for each job 1397 """ 1398 if not len(params): 1399 raise Exception("@parallel parameters is empty.") 1400 1401 for job_param in params: 1402 if isinstance(job_param, path_str_type): 1403 message = ("Wrong syntax for @parallel.\n" 1404 "@parallel(%s)\n" % ignore_unknown_encoder(params) + 1405 "If you are supplying parameters for a task " 1406 "running as a single job, " 1407 "either don't put enclosing brackets at all (with each parameter " 1408 "separated by commas) or enclose all parameters as a nested list of " 1409 "lists, e.g. [['input', 'output' ...]]. " 1410 ) 1411 raise error_object(enclosing_task, message) 1412 1413 1414# _________________________________________________________________________________________ 1415 1416# check_files_io_parameters 1417 1418# _________________________________________________________________________________________ 1419def check_files_io_parameters(enclosing_task, params, error_object): 1420 """ 1421 Helper function for @files 1422 Checks format of parameters and 1423 whether there are input and output files specified for each job 1424 """ 1425 # if not len(params): 1426 # raise Exception("@files I/O parameters is empty.") 1427 1428 try: 1429 for job_param in params: 1430 if isinstance(job_param, path_str_type): 1431 raise TypeError 1432 1433 if len(job_param) < 1: 1434 raise error_object(enclosing_task, "Missing input files for job " + 1435 ignore_unknown_encoder(job_param)) 1436 if len(job_param) < 2: 1437 raise error_object(enclosing_task, "Missing output files for job " + 1438 ignore_unknown_encoder(job_param)) 1439 # if len(get_strings_in_flattened_sequence(job_param[0:2])) == 0: 1440 # raise error_object(enclosing_task, "Input or output file parameters should " 1441 # "contain at least one or more file names strings. " 1442 # "Consider using @parallel if you are not using files. " + 1443 # ignore_unknown_encoder(job_param)) 1444 except TypeError: 1445 # 1446 # job_param was not a list 1447 # 1448 message = ("Wrong syntax for @files.\n@files(%s)\n" % ignore_unknown_encoder(params) + 1449 "If you are supplying parameters for a task " 1450 "running as a single job, " 1451 "either don't put enclosing brackets at all (with each parameter " 1452 "separated by commas) or enclose all parameters as a nested list of " 1453 "lists, e.g. [['input', 'output' ...]]. " 1454 ) 1455 raise error_object(enclosing_task, message) 1456 1457# _________________________________________________________________________________________ 1458# 1459# expand_nested_tasks_or_globs 1460# 1461# ________________________________________________________________________________________ 1462 1463 1464def expand_nested_tasks_or_globs(p, tasksglobs_to_filenames): 1465 """ 1466 Expand globs and tasks "in-line", unless they are the top level, in which case turn 1467 it into a list 1468 1469 N.B. Globs are only expanded if they are in tasksglobs_to_filenames 1470 This function is called for @split descriptors which leave output globs untouched 1471 for clarity. Thanks to Noah Spies for spotting this! 1472 """ 1473 1474 # 1475 # Expand globs or tasks as a list only if they are top level 1476 # 1477 if ((isinstance(p, path_str_type) and is_glob(p) and p in tasksglobs_to_filenames) or 1478 p.__class__.__name__ == 'Task' or 1479 isinstance(p, runtime_parameter)): 1480 return tasksglobs_to_filenames[p] 1481 1482 # 1483 # No expansions of strings and dictionaries 1484 # 1485 if isinstance(p, (path_str_type, dict)): 1486 return p 1487 1488 # 1489 # Other sequences are recursed down 1490 # 1491 elif non_str_sequence(p): 1492 l = list() 1493 for pp in p: 1494 if (isinstance(pp, path_str_type) and pp in tasksglobs_to_filenames): 1495 l.extend(tasksglobs_to_filenames[pp]) 1496 elif pp.__class__.__name__ == 'Task' or isinstance(pp, runtime_parameter): 1497 files = tasksglobs_to_filenames[pp] 1498 # task may have produced a single output: in which case append 1499 if non_str_sequence(files): 1500 l.extend(files) 1501 else: 1502 l.append(files) 1503 else: 1504 l.append(expand_nested_tasks_or_globs( 1505 pp, tasksglobs_to_filenames)) 1506 return type(p)(l) 1507 1508 # 1509 # No expansions of non-string/non-sequences 1510 # 1511 else: 1512 return p 1513 1514 1515# _________________________________________________________________________________________ 1516 1517# get_parsed_arguments_str_for_errors 1518 1519# helper funciton for parse_task_arguments() 1520 1521# _________________________________________________________________________________________ 1522def get_parsed_arguments_str_for_errors(task_description, bad_arg_str, unnamed_result_strs, named_result_strs): 1523 """ 1524 Helper function for parse_task_arguments 1525 Prints out offending argument (bad_arg_str) in the context of already parsed 1526 arguments so that we can quickly figure out where the error is coming from 1527 """ 1528 indent = task_description.find("(") + 1 1529 parsed_arg_str = ", ".join(unnamed_result_strs + named_result_strs) 1530 # make function names clearer in arg list 1531 parsed_arg_str = re.sub( 1532 r"<function (\w+) at 0x[0-9a-f]+>", r"\1", parsed_arg_str) 1533 return "\n" + task_description % (parsed_arg_str + ", ...\n" + 1534 # mark out problem 1535 (" " * (indent-5 if indent - 5 > 0 else 0)) + "===> " + 1536 bad_arg_str) 1537 1538 1539# _________________________________________________________________________________________ 1540 1541# parse_task_arguments 1542 1543# _________________________________________________________________________________________ 1544def parse_task_arguments(orig_unnamed_arguments, orig_named_arguments, expected_arguments, task_description): 1545 """ 1546 Parse arguments parsed into decorators or Pipeline.transform etc. 1547 Special handling for optional arguments in the middle of argument list 1548 1) @product 1549 can have (input, filter, input1, filter1, input2, filter2....) 1550 2) @transform, @subdivide, @collate, @product, @combinatorics which have 1551 (..., [add_inputs(...)|inputs(...)],...) 1552 or ([add_inputs=...|replace_inputs=...]) 1553 or ([add_inputs=add_inputs(...)|replace_inputs=inputs(...)]) 1554 Special handling for variable number of arguments at the end of the argument list 1555 which all become "extras" 1556 1557 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1558 # 1559 # N.B. Missing non-mandatory arguments are returned as an empty list 1560 # 1561 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1562 1563 """ 1564 results = {} 1565 unnamed_arguments = list(orig_unnamed_arguments) 1566 named_arguments = dict(orig_named_arguments) 1567 # parsed results in string form for error messages 1568 unnamed_result_strs = [] 1569 named_result_strs = [] 1570 1571 def parse_add_inputs_args(parsed_arg, input_type, arg_name, modify_inputs_mode, result_strs): 1572 """ 1573 Parse arguments for add_inputs and replace_inputs, i.e. 'inputs()' and 'add_inputs()' 1574 input_type =inputs|add_inputs 1575 arg_name = replace_inputs | add_inputs 1576 modify_inputs_mode = t_extra_inputs.REPLACE_INPUTS| t_extra_inputs.ADD_TO_INPUTS 1577 """ 1578 results["modify_inputs_mode"] = modify_inputs_mode 1579 if input_type == inputs: 1580 # inputs() only takes a single argument. Throw error otherwise 1581 if len(parsed_arg.args) != 1: 1582 err_msg = "inputs() expects a single argument:\n%s" % ( 1583 get_parsed_arguments_str_for_errors(task_description, # bad arg in context of parsed 1584 "%s%r" % (input_type.__name__, tuple( 1585 parsed_arg.args)), 1586 unnamed_result_strs, 1587 named_result_strs)) 1588 raise error_inputs_multiple_args(err_msg) 1589 # unpack add_inputs / inputs and save results 1590 results["modify_inputs"] = parsed_arg.args[0] 1591 else: 1592 results["modify_inputs"] = parsed_arg.args 1593 result_strs.append("%s=%r" % (arg_name, parsed_arg.args)) 1594 1595 def check_argument_type(arg_name, parsed_arg, argument_types): 1596 """ 1597 check if parsed_arg is right type 1598 """ 1599 if argument_types and not isinstance(parsed_arg, argument_types): 1600 err_msg = ("The '%s' argument should be %s:\n%s" % 1601 (arg_name, # argument name 1602 # type names 1603 " or ".join("%s" % 1604 tt.__name__ for tt in argument_types), 1605 get_parsed_arguments_str_for_errors(task_description, # bad arg in context of parsed 1606 "%s = %r" % ( 1607 arg_name, parsed_arg), 1608 unnamed_result_strs, named_result_strs))) 1609 #print (err_msg, file=sys.stderr) 1610 raise TypeError(err_msg) 1611 1612 return parsed_arg 1613 1614 def parse_argument(arg_name, expected_arguments, unnamed_arguments, named_arguments, 1615 results, task_description, mandatory, argument_types=None): 1616 """ 1617 All missing, non-mandatory are empty list 1618 """ 1619 1620 # ignore if not on list 1621 if not arg_name in expected_arguments: 1622 return 1623 1624 # 1625 # look among unnamed arguments first 1626 # 1627 if len(unnamed_arguments): 1628 # check correct type 1629 parsed_arg = check_argument_type( 1630 arg_name, unnamed_arguments[0], argument_types) 1631 # save parsed results 1632 results[arg_name] = parsed_arg 1633 unnamed_result_strs.append("%s=%r" % (arg_name, parsed_arg)) 1634 del unnamed_arguments[0] 1635 1636 # 1637 # then named 1638 # 1639 elif arg_name in named_arguments: 1640 # 1641 # check correct type 1642 # 1643 parsed_arg = check_argument_type( 1644 arg_name, named_arguments[arg_name], argument_types) 1645 # 1646 # Save parsed results 1647 # 1648 results[arg_name] = parsed_arg 1649 named_result_strs.append("%s=%r" % (arg_name, parsed_arg)) 1650 del named_arguments[arg_name] 1651 1652 # 1653 # complain or ignore missing? 1654 # 1655 else: 1656 if mandatory: 1657 err_msg = "Missing '%s' argument:\n%s" % ( 1658 arg_name, # argument name 1659 get_parsed_arguments_str_for_errors(task_description, # bad arg in 1660 arg_name + " = ???", # context of parsed 1661 unnamed_result_strs, 1662 named_result_strs)) 1663 #print (err_msg, file=sys.stderr) 1664 raise error_missing_args(err_msg) 1665 1666 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1667 # 1668 # N.B. Missing non-mandatory arguments are returned as an empty list 1669 # 1670 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1671 else: 1672 results[arg_name] = [] 1673 1674 # 1675 # Missing input is empty list 1676 # 1677 parse_argument('input', expected_arguments, unnamed_arguments, 1678 named_arguments, results, task_description, mandatory=False) 1679 1680 # 1681 # filter is mandatory if expected 1682 # 1683 parse_argument('filter', expected_arguments, unnamed_arguments, named_arguments, results, 1684 task_description, mandatory=True, argument_types=(formatter, regex, suffix)) 1685 1686 if "filter" in results: 1687 if isinstance(results["filter"], suffix): 1688 parse_argument("output_dir", expected_arguments, [ 1689 ], named_arguments, results, task_description, mandatory=False) 1690 1691 # 1692 # inputN 1693 # 1694 if 'inputN' in expected_arguments: 1695 # 1696 # put already parsed input and filter into the list 1697 # 1698 results["input"] = [results["input"]] 1699 results["filter"] = [results["filter"]] 1700 cnt_inputN = 1 1701 # 1702 # parse argument pairs at a time, so long as the second argument is a formatter 1703 # 1704 while len(unnamed_arguments) >= 2 and isinstance(unnamed_arguments[1], formatter): 1705 filter_name = "filter%d" % (cnt_inputN + 1) 1706 input_name = "input%d" % (cnt_inputN + 1) 1707 unnamed_result_strs.append("%s=%r" % ( 1708 input_name, unnamed_arguments[0])) 1709 unnamed_result_strs.append("%s=%r" % ( 1710 filter_name, unnamed_arguments[1])) 1711 results["input"].append(unnamed_arguments[0]) 1712 results["filter"].append(unnamed_arguments[1]) 1713 cnt_inputN += 1 1714 del unnamed_arguments[0:2] 1715 1716 # 1717 # parse named arguments while there is a filter2, filter3 etc. 1718 # 1719 filter_name = "filter%d" % (cnt_inputN + 1) 1720 input_name = "input%d" % (cnt_inputN + 1) 1721 while filter_name in named_arguments: 1722 results["filter"].append(named_arguments[filter_name]) 1723 named_result_strs.append("%s=%r" % ( 1724 filter_name, named_arguments[filter_name])) 1725 del named_arguments[filter_name] 1726 # parse input2, input3 or leave blank as empty list 1727 if input_name in named_arguments: 1728 results["input"].append(named_arguments[input_name]) 1729 named_result_strs.append("%s=%r" % ( 1730 input_name, named_arguments[input_name])) 1731 del named_arguments[input_name] 1732 else: 1733 results["input"].append([]) 1734 cnt_inputN += 1 1735 filter_name = "filter%d" % (cnt_inputN + 1) 1736 input_name = "input%d" % (cnt_inputN + 1) 1737 1738 # 1739 # tuple size is int and mandatory if exists 1740 # 1741 parse_argument('tuple_size', expected_arguments, unnamed_arguments, named_arguments, 1742 results, task_description, mandatory=True, argument_types=(int,)) 1743 1744 # 1745 # add_inputs / inputs are optional 1746 # 1747 if 'modify_inputs' in expected_arguments: 1748 results["modify_inputs_mode"] = t_extra_inputs.KEEP_INPUTS 1749 results["modify_inputs"] = None 1750 parse_add_inputs = ((inputs, "inputs", "replace_inputs", t_extra_inputs.REPLACE_INPUTS), 1751 (add_inputs, "add_inputs", "add_inputs", t_extra_inputs.ADD_TO_INPUTS)) 1752 1753 if len(unnamed_arguments): 1754 # 1755 # Is add_inputs or inputs in unnamed arguments? 1756 # Parse out contents and save in results["replace_inputs"] or results["add_inputs"] 1757 # 1758 for input_type, input_type_name, arg_name, modify_inputs_mode in parse_add_inputs: 1759 parsed_arg = unnamed_arguments[0] 1760 if isinstance(parsed_arg, input_type): 1761 parse_add_inputs_args( 1762 parsed_arg, input_type, arg_name, modify_inputs_mode, unnamed_result_strs) 1763 del unnamed_arguments[0] 1764 break 1765 # 1766 # Otherwise is add_inputs or inputs in named arguments? 1767 # Parse out contents only if necessary and save in results["replace_inputs"] or results["add_inputs"] 1768 # 1769 if results["modify_inputs_mode"] == t_extra_inputs.KEEP_INPUTS: 1770 for input_type, input_type_name, arg_name, modify_inputs_mode in parse_add_inputs: 1771 if arg_name in named_arguments: 1772 parsed_arg = named_arguments[arg_name] 1773 if isinstance(parsed_arg, input_type): 1774 parse_add_inputs_args( 1775 parsed_arg, input_type, arg_name, modify_inputs_mode, named_result_strs) 1776 else: 1777 results["modify_inputs"] = parsed_arg 1778 results["modify_inputs_mode"] = modify_inputs_mode 1779 del named_arguments[arg_name] 1780 break 1781 1782 # 1783 # output is mandatory if exists 1784 # 1785 parse_argument('output', expected_arguments, unnamed_arguments, 1786 named_arguments, results, task_description, mandatory=True) 1787 1788 # 1789 # extras is mandatory if exists 1790 # 1791 if 'extras' in expected_arguments: 1792 results['extras'] = [] 1793 results['named_extras'] = {} 1794 if len(unnamed_arguments): 1795 # move list to results: remember python does shallow copies of lists 1796 results['extras'] = unnamed_arguments 1797 unnamed_result_strs.append("%s=%r" % ("extras", unnamed_arguments)) 1798 unnamed_arguments = [] 1799 #del unnamed_arguments[:] 1800 elif 'extras' in named_arguments: 1801 # Named extras only 1802 if isinstance(named_arguments['extras'], dict): 1803 results["named_extras"] = named_arguments['extras'] 1804 # Unnamed extras only 1805 elif isinstance(named_arguments['extras'], list): 1806 results["extras"] = named_arguments['extras'] 1807 # Wrong type: blow up 1808 else: 1809 err_msg = ("The extras paramter must be either a list of values\nor a dictionary of named parameter values:\n%s" % 1810 get_parsed_arguments_str_for_errors(task_description, 1811 "extras=%r" % ( 1812 named_arguments['extras'],), 1813 unnamed_result_strs, 1814 named_result_strs)) 1815 raise error_extras_wrong_type(err_msg) 1816 1817 named_result_strs.append("%s=%r" % ( 1818 "extras", named_arguments['extras'])) 1819 del named_arguments['extras'] 1820 1821 if len(unnamed_arguments): 1822 err_msg = ("Too many unnamed arguments leftover:\n%s" % 1823 get_parsed_arguments_str_for_errors(task_description, # bad arg in context of parsed 1824 (", ".join(("%r" % a) 1825 for a in unnamed_arguments)), 1826 unnamed_result_strs, named_result_strs)) 1827 #print (err_msg, file=sys.stderr) 1828 raise error_too_many_args(err_msg) 1829 if len(named_arguments): 1830 err_msg = ("Duplicate, conflicting or unrecognised arguments:\n%s" % 1831 get_parsed_arguments_str_for_errors(task_description, # bad arg in context of parsed 1832 ", ".join("%s=%r" % ( 1833 k, v) for k, v in named_arguments.items()), 1834 unnamed_result_strs, named_result_strs)) 1835 #print (err_msg, file=sys.stderr) 1836 raise error_too_many_args(err_msg) 1837 1838 return results 1839 1840# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 1841 1842# special markers used by @files_re 1843 1844# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 1845 1846 1847class combine(object): 1848 def __init__(self, *args): 1849 self.args = args 1850 1851 1852class output_from(object): 1853 def __init__(self, *args): 1854 self.args = args 1855 1856 def __repr__(self, *args): 1857 return 'output_from%r' % (self.args,) 1858 1859 1860class runtime_parameter(object): 1861 def __init__(self, *args): 1862 if len(args) != 1 or not isinstance(args[0], path_str_type): 1863 raise Exception( 1864 "runtime_parameter takes the name of the run time parameter as a single string") 1865 self.args = args 1866 1867 1868# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 1869 1870# Unit Testing code in test/test_ruffus_utility.py 1871 1872 1873# 88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888 1874