#!/usr/bin/env python

""" MultiQC report module. Holds the output from each
module. Is available to subsequent modules. Contains
helper functions to generate markup for the report. """

from __future__ import print_function
from collections import defaultdict, OrderedDict
import fnmatch
import inspect
import io
import json
import lzstring
import mimetypes
import os
import re
import rich.progress
import time
import yaml

from multiqc import config

logger = config.logger

# Treat defaultdict and OrderedDict as normal dicts for YAML output
from yaml.representer import Representer, SafeRepresenter

yaml.add_representer(defaultdict, Representer.represent_dict)
yaml.add_representer(OrderedDict, Representer.represent_dict)
try:
    yaml.add_representer(unicode, SafeRepresenter.represent_unicode)
except NameError:
    pass  # Python 3

# Set up global variables shared across modules
general_stats_data = list()
general_stats_headers = list()
general_stats_html = ""
data_sources = defaultdict(lambda: defaultdict(lambda: defaultdict()))
plot_data = dict()
html_ids = list()
lint_errors = list()
num_hc_plots = 0
num_mpl_plots = 0
saved_raw_data = dict()
last_found_file = None
runtimes = {
    "total": 0,
    "total_sp": 0,
    "total_mods": 0,
    "total_compression": 0,
    "sp": defaultdict(),
    "mods": defaultdict(),
}
file_search_stats = {
    "skipped_symlinks": 0,
    "skipped_not_a_file": 0,
    "skipped_ignore_pattern": 0,
    "skipped_filesize_limit": 0,
    "skipped_no_match": 0,
}

# Make a dict of discovered files for each search key
searchfiles = list()
files = dict()


def get_filelist(run_module_names):
    """
    Go through all supplied search directories and assemble a master
    list of files to search. Then fire search functions for each file.
    """
    # Prep search patterns
    spatterns = [{}, {}, {}, {}, {}, {}, {}]
    epatterns = [{}, {}]
    runtimes["sp"] = defaultdict()
    ignored_patterns = []
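    # NOTE: illustrative sketch only. config.sp maps a "module/section" search
    # key to one or more search pattern dicts, using the keys validated below.
    # Parsed from YAML, a couple of hypothetical entries could look like:
    #
    #   config.sp = {
    #       "fastqc/data": {"fn": "fastqc_data.txt"},
    #       "samtools/stats": [
    #           {"contents": "This file was produced by samtools stats", "num_lines": 10},
    #       ],
    #   }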
    for key, sps in config.sp.items():
        mod_name = key.split("/", 1)[0]
        if mod_name.lower() not in [m.lower() for m in run_module_names]:
            ignored_patterns.append(key)
            continue
        files[key] = list()
        if not isinstance(sps, list):
            sps = [sps]

        # Warn if we have any unrecognised search pattern keys
        expected_sp_keys = [
            "fn",
            "fn_re",
            "contents",
            "contents_re",
            "num_lines",
            "shared",
            "skip",
            "max_filesize",
            "exclude_fn",
            "exclude_fn_re",
            "exclude_contents",
            "exclude_contents_re",
        ]
        unrecognised_keys = [y for x in sps for y in x.keys() if y not in expected_sp_keys]
        if len(unrecognised_keys) > 0:
            logger.warning("Unrecognised search pattern keys for '{}': {}".format(key, ", ".join(unrecognised_keys)))

        # Check if we are skipping this search key
        if any([x.get("skip") for x in sps]):
            logger.warning("Skipping search pattern: {}".format(key))
            continue

        # Split search patterns according to speed of execution
        if any([x for x in sps if "contents_re" in x]):
            if any([x for x in sps if "num_lines" in x]):
                spatterns[4][key] = sps
            elif any([x for x in sps if "max_filesize" in x]):
                spatterns[5][key] = sps
            else:
                spatterns[6][key] = sps
        elif any([x for x in sps if "contents" in x]):
            if any([x for x in sps if "num_lines" in x]):
                spatterns[1][key] = sps
            elif any([x for x in sps if "max_filesize" in x]):
                spatterns[2][key] = sps
            else:
                spatterns[3][key] = sps
        else:
            spatterns[0][key] = sps

    if len(ignored_patterns) > 0:
        logger.debug("Ignored {} search patterns as they didn't match the running modules.".format(len(ignored_patterns)))
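
    # The speed buckets above are searched in order, cheapest checks first:
    #   spatterns[0]   - filename-only patterns (no file reading at all)
    #   spatterns[1-3] - plain-string `contents` patterns: `num_lines`-bounded,
    #                    then `max_filesize`-bounded, then unbounded
    #   spatterns[4-6] - `contents_re` regex patterns, bounded likewise (slowest last)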

    def add_file(fn, root):
        """
        Function applied to each file found when walking the analysis
        directories. Runs through all search patterns and returns True
        if a match is found.
        """
        f = {"fn": fn, "root": root}

        # Check that this is a file and not a pipe or anything weird
        if not os.path.isfile(os.path.join(root, fn)):
            file_search_stats["skipped_not_a_file"] += 1
            return False

        # Check that we don't want to ignore this file
        i_matches = [n for n in config.fn_ignore_files if fnmatch.fnmatch(fn, n)]
        if len(i_matches) > 0:
            logger.debug("Ignoring file as matched an ignore pattern: {}".format(fn))
            file_search_stats["skipped_ignore_pattern"] += 1
            return False

        # Limit search to small files, to avoid 30GB FastQ files etc.
        try:
            f["filesize"] = os.path.getsize(os.path.join(root, fn))
        except (IOError, OSError, ValueError, UnicodeDecodeError):
            logger.debug("Couldn't read file when checking filesize: {}".format(fn))
        else:
            if f["filesize"] > config.log_filesize_limit:
                file_search_stats["skipped_filesize_limit"] += 1
                return False

        # Test file for each search pattern
        file_matched = False
        for patterns in spatterns:
            for key, sps in patterns.items():
                start = time.time()
                for sp in sps:
                    if search_file(sp, f, key):
                        # Check that we shouldn't exclude this file
                        if not exclude_file(sp, f):
                            # Looks good! Remember this file
                            files[key].append(f)
                            file_search_stats[key] = file_search_stats.get(key, 0) + 1
                            file_matched = True
                            # Don't keep searching this file for other modules
                            if not sp.get("shared", False):
                                runtimes["sp"][key] = runtimes["sp"].get(key, 0) + (time.time() - start)
                                return True
                        # Don't look at other patterns for this module
                        else:
                            break
                runtimes["sp"][key] = runtimes["sp"].get(key, 0) + (time.time() - start)

        return file_matched
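
    # For reference, a matched file record appended to files[key] above is a
    # plain dict, e.g. (hypothetical values):
    #   {"fn": "sample1_fastqc.zip", "root": "results/fastqc", "filesize": 482133}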

    # Go through the analysis directories and get file list
    multiqc_installation_dir_files = [
        "LICENSE",
        "CHANGELOG.md",
        "Dockerfile",
        "MANIFEST.in",
        ".gitmodules",
        "README.md",
        "CSP.txt",
        "setup.py",
        ".gitignore",
    ]
    total_sp_starttime = time.time()
    for path in config.analysis_dir:
        if os.path.islink(path) and config.ignore_symlinks:
            file_search_stats["skipped_symlinks"] += 1
            continue
        elif os.path.isfile(path):
            searchfiles.append([os.path.basename(path), os.path.dirname(path)])
        elif os.path.isdir(path):
            for root, dirnames, filenames in os.walk(path, followlinks=(not config.ignore_symlinks), topdown=True):
                bname = os.path.basename(root)

                # Skip any sub-directories matching ignore params
                orig_dirnames = dirnames[:]
                for n in config.fn_ignore_dirs:
                    dirnames[:] = [d for d in dirnames if not fnmatch.fnmatch(d, n.rstrip(os.sep))]
                    if len(orig_dirnames) != len(dirnames):
                        removed_dirs = [
                            os.path.join(root, d) for d in set(orig_dirnames).symmetric_difference(set(dirnames))
                        ]
                        logger.debug("Ignoring directory as matched fn_ignore_dirs: {}".format(", ".join(removed_dirs)))
                        orig_dirnames = dirnames[:]
                for n in config.fn_ignore_paths:
                    dirnames[:] = [d for d in dirnames if not fnmatch.fnmatch(os.path.join(root, d), n.rstrip(os.sep))]
                    if len(orig_dirnames) != len(dirnames):
                        removed_dirs = [
                            os.path.join(root, d) for d in set(orig_dirnames).symmetric_difference(set(dirnames))
                        ]
                        logger.debug(
                            "Ignoring directory as matched fn_ignore_paths: {}".format(", ".join(removed_dirs))
                        )

                # Skip *this* directory if it matches ignore params
                d_matches = [n for n in config.fn_ignore_dirs if fnmatch.fnmatch(bname, n.rstrip(os.sep))]
                if len(d_matches) > 0:
                    logger.debug("Ignoring directory as matched fn_ignore_dirs: {}".format(bname))
                    continue
                p_matches = [n for n in config.fn_ignore_paths if fnmatch.fnmatch(root, n.rstrip(os.sep))]
                if len(p_matches) > 0:
                    logger.debug("Ignoring directory as matched fn_ignore_paths: {}".format(root))
                    continue

                # Sanity check - make sure that we're not just running in the installation directory
                if len(filenames) > 0 and all([fn in filenames for fn in multiqc_installation_dir_files]):
                    logger.error("Error: MultiQC is running in source code directory! {}".format(root))
                    logger.warning(
                        "Please see the docs for how to use MultiQC: https://multiqc.info/docs/#running-multiqc"
                    )
                    dirnames[:] = []
                    filenames[:] = []
                    continue

                # Search filenames in this directory
                for fn in filenames:
                    searchfiles.append([fn, root])

    # Search through collected files
    progress_obj = rich.progress.Progress(
        "[progress.description]{task.description}",
        rich.progress.SpinnerColumn(),
        rich.progress.BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        "[green]{task.completed}/{task.total}",
        "[dim]{task.fields[s_fn]}",
    )
    with progress_obj as progress:
        mqc_task = progress.add_task("Searching", total=len(searchfiles), s_fn="")
        for sf in searchfiles:
            progress.update(mqc_task, advance=1, s_fn=os.path.join(sf[1], sf[0])[-50:])
            if not add_file(sf[0], sf[1]):
                file_search_stats["skipped_no_match"] += 1
        progress.update(mqc_task, s_fn="")

    runtimes["total_sp"] = time.time() - total_sp_starttime


def search_file(pattern, f, module_key):
    """
    Function to search a single file for a single search pattern.
    """

    fn_matched = False
    contents_matched = False

    # Use mimetypes to exclude binary files where possible
    if not re.match(r".+_mqc\.(png|jpg|jpeg)", f["fn"]) and config.ignore_images:
        (ftype, encoding) = mimetypes.guess_type(os.path.join(f["root"], f["fn"]))
        if encoding is not None:
            return False
        if ftype is not None and ftype.startswith("image"):
            return False

    # Search pattern specific filesize limit
    if pattern.get("max_filesize") is not None and "filesize" in f:
        if f["filesize"] > pattern.get("max_filesize"):
            logger.debug(
                "File ignored by {} because it exceeded search pattern filesize limit: {}".format(module_key, f["fn"])
            )
            return False

    # Search by file name (glob)
    if pattern.get("fn") is not None:
        if fnmatch.fnmatch(f["fn"], pattern["fn"]):
            fn_matched = True
            if pattern.get("contents") is None and pattern.get("contents_re") is None:
                return True

    # Search by file name (regex)
    if pattern.get("fn_re") is not None:
        if re.match(pattern["fn_re"], f["fn"]):
            fn_matched = True
            if pattern.get("contents") is None and pattern.get("contents_re") is None:
                return True

    # Search by file contents
    if pattern.get("contents") is not None or pattern.get("contents_re") is not None:
        if pattern.get("contents_re") is not None:
            repattern = re.compile(pattern["contents_re"])
        try:
            # Use a distinct name for the file handle so that we don't shadow
            # the file-info dict `f`, which the except clause below still needs
            with io.open(os.path.join(f["root"], f["fn"]), "r", encoding="utf-8") as fh:
                line_count = 1
                for line in fh:
                    # Search by file contents (string)
                    if pattern.get("contents") is not None:
                        if pattern["contents"] in line:
                            contents_matched = True
                            if pattern.get("fn") is None and pattern.get("fn_re") is None:
                                return True
                            break
                    # Search by file contents (regex)
                    elif pattern.get("contents_re") is not None:
                        if re.search(repattern, line):
                            contents_matched = True
                            if pattern.get("fn") is None and pattern.get("fn_re") is None:
                                return True
                            break
                    # Break if we've searched enough lines for this pattern
                    if pattern.get("num_lines") and line_count >= pattern.get("num_lines"):
                        break
                    line_count += 1
        except (IOError, OSError, ValueError, UnicodeDecodeError):
            if config.report_readerrors:
                logger.debug("Couldn't read file when looking for output: {}".format(f["fn"]))
            return False

    return fn_matched and contents_matched
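

# A minimal usage sketch for search_file() with a hypothetical pattern and file
# record (not the normal call path - add_file() drives this in practice):
#
#     pattern = {"fn": "*_fastqc.zip"}
#     f = {"fn": "sample1_fastqc.zip", "root": "results", "filesize": 1024}
#     search_file(pattern, f, "fastqc")  # -> True (filename glob matches,
#                                        #    no contents check required)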


def exclude_file(sp, f):
    """
    Exclude discovered files if they match the special exclude_
    search pattern keys
    """
    # Make everything a list if it isn't already
    for k in sp:
        if k in ["exclude_fn", "exclude_fn_re", "exclude_contents", "exclude_contents_re"]:
            if not isinstance(sp[k], list):
                sp[k] = [sp[k]]

    # Search by file name (glob)
    if "exclude_fn" in sp:
        for pat in sp["exclude_fn"]:
            if fnmatch.fnmatch(f["fn"], pat):
                return True

    # Search by file name (regex)
    if "exclude_fn_re" in sp:
        for pat in sp["exclude_fn_re"]:
            if re.match(pat, f["fn"]):
                return True

    # Search the contents of the file
    if "exclude_contents" in sp or "exclude_contents_re" in sp:
        # Compile regex patterns if we have any
        if "exclude_contents_re" in sp:
            sp["exclude_contents_re"] = [re.compile(pat) for pat in sp["exclude_contents_re"]]
        with io.open(os.path.join(f["root"], f["fn"]), "r", encoding="utf-8") as fh:
            for line in fh:
                if "exclude_contents" in sp:
                    for pat in sp["exclude_contents"]:
                        if pat in line:
                            return True
                if "exclude_contents_re" in sp:
                    for pat in sp["exclude_contents_re"]:
                        if re.search(pat, line):
                            return True
    return False


def data_sources_tofile():
    fn = "multiqc_sources.{}".format(config.data_format_extensions[config.data_format])
    with io.open(os.path.join(config.data_dir, fn), "w", encoding="utf-8") as f:
        if config.data_format == "json":
            jsonstr = json.dumps(data_sources, indent=4, ensure_ascii=False)
            print(jsonstr.encode("utf-8", "ignore").decode("utf-8"), file=f)
        elif config.data_format == "yaml":
            yaml.dump(data_sources, f, default_flow_style=False)
        else:
            lines = [["Module", "Section", "Sample Name", "Source"]]
            for mod in data_sources:
                for sec in data_sources[mod]:
                    for s_name, source in data_sources[mod][sec].items():
                        lines.append([mod, sec, s_name, source])
            body = "\n".join(["\t".join(line) for line in lines])
            print(body.encode("utf-8", "ignore").decode("utf-8"), file=f)
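

# For the default tab-separated branch above, the sources file is a simple
# table - schematically, with a hypothetical row:
#
#   Module    Section         Sample Name    Source
#   FastQC    all_sections    sample1        results/fastqc/sample1_fastqc.zip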


def save_htmlid(html_id, skiplint=False):
    """Take an HTML ID, sanitise it for HTML, check for duplicates and save.
    Returns a sanitised, unique ID."""
    global html_ids
    global lint_errors

    # Trailing whitespace
    html_id_clean = html_id.strip()

    # Trailing underscores
    html_id_clean = html_id_clean.strip("_")

    # Must begin with a letter
    if re.match(r"^[a-zA-Z]", html_id_clean) is None:
        html_id_clean = "mqc_{}".format(html_id_clean)

    # Replace illegal characters
    html_id_clean = re.sub("[^a-zA-Z0-9_-]+", "_", html_id_clean)

    # Validate if linting
    modname = ""
    codeline = ""
    if config.lint and not skiplint:
        callstack = inspect.stack()
        for n in callstack:
            if "multiqc/modules/" in n[1] and "base_module.py" not in n[1]:
                callpath = n[1].split("multiqc/modules/", 1)[-1]
                modname = ">{}< ".format(callpath)
                codeline = n[4][0].strip()
                break
    if config.lint and not skiplint and html_id != html_id_clean:
        errmsg = "LINT: {}HTML ID was not clean ('{}' -> '{}') ## {}".format(modname, html_id, html_id_clean, codeline)
        logger.error(errmsg)
        lint_errors.append(errmsg)

    # Check for duplicates
    i = 1
    html_id_base = html_id_clean
    while html_id_clean in html_ids:
        html_id_clean = "{}-{}".format(html_id_base, i)
        i += 1
        if config.lint and not skiplint:
            errmsg = "LINT: {}HTML ID was a duplicate ({}) ## {}".format(modname, html_id_clean, codeline)
            logger.error(errmsg)
            lint_errors.append(errmsg)

    # Remember and return
    html_ids.append(html_id_clean)
    return html_id_clean


def compress_json(data):
    """Take a Python data object. Convert to JSON and compress using lzstring."""
    json_string = json.dumps(data).encode("utf-8", "ignore").decode("utf-8")
    json_string = sanitise_json(json_string)
    x = lzstring.LZString()
    return x.compressToBase64(json_string)


def sanitise_json(json_string):
    """
    The Python json module uses a bunch of values which are valid JavaScript
    but invalid JSON. These crash the browser when parsing the JSON.
    Nothing in the MultiQC front-end uses these values, so instead we just
    do a find-and-replace for them, switching them for `null`, which works fine.

    Side effect: any string value that includes the word "Infinity"
    (case-sensitive) will have it switched for "null". Hopefully that doesn't
    happen a lot, otherwise we'll have to do this in a more complicated manner.
    """
    json_string = re.sub(r"\bNaN\b", "null", json_string)
    json_string = re.sub(r"\b-?Infinity\b", "null", json_string)
    return json_string
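

# A sketch of how the two JSON helpers above compose (hypothetical input):
#
#     >>> json.dumps({"x": float("nan")})
#     '{"x": NaN}'                      # valid JavaScript, but invalid JSON
#     >>> sanitise_json('{"x": NaN}')
#     '{"x": null}'                     # safe for the browser's JSON.parse
#
# compress_json() then LZString-compresses the sanitised string to base64
# for embedding in the report HTML.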