#!/usr/bin/env python

""" MultiQC report module. Holds the output from each
module and makes it available to subsequent modules. Contains
helper functions to generate markup for the report. """

from __future__ import print_function
from collections import defaultdict, OrderedDict
import fnmatch
import inspect
import io
import json
import lzstring
import mimetypes
import os
import re
import rich.progress
import time
import yaml

from multiqc import config

logger = config.logger

# Treat defaultdict and OrderedDict as normal dicts for YAML output
from yaml.representer import Representer, SafeRepresenter

yaml.add_representer(defaultdict, Representer.represent_dict)
yaml.add_representer(OrderedDict, Representer.represent_dict)
try:
    yaml.add_representer(unicode, SafeRepresenter.represent_unicode)
except NameError:
    pass  # Python 3

# Set up global variables shared across modules
general_stats_data = list()
general_stats_headers = list()
general_stats_html = ""
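# data_sources[module][section][sample_name] = path of the file the data came from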
data_sources = defaultdict(lambda: defaultdict(lambda: defaultdict()))
plot_data = dict()
html_ids = list()
lint_errors = list()
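# Counters for the two plot renderers: "hc" (interactive HighCharts) and "mpl" (static matplotlib)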
num_hc_plots = 0
num_mpl_plots = 0
saved_raw_data = dict()
last_found_file = None
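# Timing information for the file search and the module runs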
runtimes = {
    "total": 0,
    "total_sp": 0,
    "total_mods": 0,
    "total_compression": 0,
    "sp": defaultdict(),
    "mods": defaultdict(),
}
file_search_stats = {
    "skipped_symlinks": 0,
    "skipped_not_a_file": 0,
    "skipped_ignore_pattern": 0,
    "skipped_filesize_limit": 0,
    "skipped_no_match": 0,
}

# Make a dict of discovered files for each search key
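# searchfiles: [filename, root_dir] pairs collected from the analysis directories
# files: maps each search pattern key to a list of {"fn": ..., "root": ...} dicts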
searchfiles = list()
files = dict()


def get_filelist(run_module_names):
    """
    Go through all supplied search directories and assemble a master
    list of files to search. Then fire search functions for each file.
    """
    # Prep search patterns
    spatterns = [{}, {}, {}, {}, {}, {}, {}]
    epatterns = [{}, {}]
    runtimes["sp"] = defaultdict()
    ignored_patterns = []
    for key, sps in config.sp.items():
        mod_name = key.split("/", 1)[0]
        if mod_name.lower() not in [m.lower() for m in run_module_names]:
            ignored_patterns.append(key)
            continue
        files[key] = list()
        if not isinstance(sps, list):
            sps = [sps]

        # Warn if we have any unrecognised search pattern keys
        expected_sp_keys = [
            "fn",
            "fn_re",
            "contents",
            "contents_re",
            "num_lines",
            "shared",
            "skip",
            "max_filesize",
            "exclude_fn",
            "exclude_fn_re",
            "exclude_contents",
            "exclude_contents_re",
        ]
        unrecognised_keys = [y for x in sps for y in x.keys() if y not in expected_sp_keys]
        if len(unrecognised_keys) > 0:
            logger.warning("Unrecognised search pattern keys for '{}': {}".format(key, ", ".join(unrecognised_keys)))

        # Check if we are skipping this search key
        if any([x.get("skip") for x in sps]):
            logger.warning("Skipping search pattern: {}".format(key))
            continue

        # Split search patterns according to speed of execution.
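        # Bucket 0: filename-only patterns (fastest). Buckets 1-3: string
        # `contents` searches, cheapest first (line-limited by `num_lines`,
        # then size-limited by `max_filesize`, then unbounded). Buckets 4-6:
        # regex `contents_re` searches in the same order (slowest).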
        if any([x for x in sps if "contents_re" in x]):
            if any([x for x in sps if "num_lines" in x]):
                spatterns[4][key] = sps
            elif any([x for x in sps if "max_filesize" in x]):
                spatterns[5][key] = sps
            else:
                spatterns[6][key] = sps
        elif any([x for x in sps if "contents" in x]):
            if any([x for x in sps if "num_lines" in x]):
                spatterns[1][key] = sps
            elif any([x for x in sps if "max_filesize" in x]):
                spatterns[2][key] = sps
            else:
                spatterns[3][key] = sps
        else:
            spatterns[0][key] = sps

    if len(ignored_patterns) > 0:
        logger.debug("Ignored {} search patterns as they didn't match the running modules.".format(len(ignored_patterns)))

    def add_file(fn, root):
        """
        Function applied to each file found when walking the analysis
        directories. Runs through all search patterns and returns True
        if a match is found.
        """
        f = {"fn": fn, "root": root}

        # Check that this is a file and not a pipe or anything weird
        if not os.path.isfile(os.path.join(root, fn)):
            file_search_stats["skipped_not_a_file"] += 1
            return False

        # Check that we don't want to ignore this file
        i_matches = [n for n in config.fn_ignore_files if fnmatch.fnmatch(fn, n)]
        if len(i_matches) > 0:
            logger.debug("Ignoring file as matched an ignore pattern: {}".format(fn))
            file_search_stats["skipped_ignore_pattern"] += 1
            return False

        # Limit search to small files, to avoid 30GB FastQ files etc.
        try:
            f["filesize"] = os.path.getsize(os.path.join(root, fn))
        except (IOError, OSError, ValueError, UnicodeDecodeError):
            logger.debug("Couldn't read file when checking filesize: {}".format(fn))
        else:
            if f["filesize"] > config.log_filesize_limit:
                file_search_stats["skipped_filesize_limit"] += 1
                return False

        # Test file for each search pattern
        file_matched = False
        for patterns in spatterns:
            for key, sps in patterns.items():
                start = time.time()
                for sp in sps:
                    if search_file(sp, f, key):
                        # Check that we shouldn't exclude this file
                        if not exclude_file(sp, f):
                            # Looks good! Remember this file
                            files[key].append(f)
                            file_search_stats[key] = file_search_stats.get(key, 0) + 1
                            file_matched = True
                        # Don't keep searching this file for other modules
                        if not sp.get("shared", False):
                            runtimes["sp"][key] = runtimes["sp"].get(key, 0) + (time.time() - start)
                            return True
                        # Don't look at other patterns for this module
                        else:
                            break
                runtimes["sp"][key] = runtimes["sp"].get(key, 0) + (time.time() - start)

        return file_matched

    # Go through the analysis directories and get file list
    multiqc_installation_dir_files = [
        "LICENSE",
        "CHANGELOG.md",
        "Dockerfile",
        "MANIFEST.in",
        ".gitmodules",
        "README.md",
        "CSP.txt",
        "setup.py",
        ".gitignore",
    ]
    total_sp_starttime = time.time()
    for path in config.analysis_dir:
        if os.path.islink(path) and config.ignore_symlinks:
            file_search_stats["skipped_symlinks"] += 1
            continue
        elif os.path.isfile(path):
            searchfiles.append([os.path.basename(path), os.path.dirname(path)])
        elif os.path.isdir(path):
            for root, dirnames, filenames in os.walk(path, followlinks=(not config.ignore_symlinks), topdown=True):
                bname = os.path.basename(root)

                # Skip any sub-directories matching ignore params
                orig_dirnames = dirnames[:]
                for n in config.fn_ignore_dirs:
                    dirnames[:] = [d for d in dirnames if not fnmatch.fnmatch(d, n.rstrip(os.sep))]
                    if len(orig_dirnames) != len(dirnames):
                        removed_dirs = [
                            os.path.join(root, d) for d in set(orig_dirnames).symmetric_difference(set(dirnames))
                        ]
                        logger.debug("Ignoring directory as matched fn_ignore_dirs: {}".format(", ".join(removed_dirs)))
                        orig_dirnames = dirnames[:]
                for n in config.fn_ignore_paths:
                    dirnames[:] = [d for d in dirnames if not fnmatch.fnmatch(os.path.join(root, d), n.rstrip(os.sep))]
                    if len(orig_dirnames) != len(dirnames):
                        removed_dirs = [
                            os.path.join(root, d) for d in set(orig_dirnames).symmetric_difference(set(dirnames))
                        ]
                        logger.debug(
                            "Ignoring directory as matched fn_ignore_paths: {}".format(", ".join(removed_dirs))
                        )

                # Skip *this* directory if matches ignore params
                d_matches = [n for n in config.fn_ignore_dirs if fnmatch.fnmatch(bname, n.rstrip(os.sep))]
                if len(d_matches) > 0:
                    logger.debug("Ignoring directory as matched fn_ignore_dirs: {}".format(bname))
                    continue
                p_matches = [n for n in config.fn_ignore_paths if fnmatch.fnmatch(root, n.rstrip(os.sep))]
                if len(p_matches) > 0:
                    logger.debug("Ignoring directory as matched fn_ignore_paths: {}".format(root))
                    continue

                # Sanity check - make sure that we're not just running in the installation directory
                if len(filenames) > 0 and all([fn in filenames for fn in multiqc_installation_dir_files]):
                    logger.error("Error: MultiQC is running in source code directory! {}".format(root))
                    logger.warning(
                        "Please see the docs for how to use MultiQC: https://multiqc.info/docs/#running-multiqc"
                    )
                    dirnames[:] = []
                    filenames[:] = []
                    continue

                # Search filenames in this directory
                for fn in filenames:
                    searchfiles.append([fn, root])

    # Search through collected files
    progress_obj = rich.progress.Progress(
        "[progress.description]{task.description}",
        rich.progress.SpinnerColumn(),
        rich.progress.BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        "[green]{task.completed}/{task.total}",
        "[dim]{task.fields[s_fn]}",
    )
    with progress_obj as progress:
        mqc_task = progress.add_task("Searching", total=len(searchfiles), s_fn="")
        for sf in searchfiles:
            progress.update(mqc_task, advance=1, s_fn=os.path.join(sf[1], sf[0])[-50:])
            if not add_file(sf[0], sf[1]):
                file_search_stats["skipped_no_match"] += 1
        progress.update(mqc_task, s_fn="")

    runtimes["total_sp"] = time.time() - total_sp_starttime


def search_file(pattern, f, module_key):
    """
    Search a single file for a single search pattern.
    """

    fn_matched = False
    contents_matched = False

    # Use mimetypes to exclude binary files where possible
    if not re.match(r".+_mqc\.(png|jpg|jpeg)", f["fn"]) and config.ignore_images:
        (ftype, encoding) = mimetypes.guess_type(os.path.join(f["root"], f["fn"]))
        if encoding is not None:
            return False
        if ftype is not None and ftype.startswith("image"):
            return False

    # Search pattern specific filesize limit
    if pattern.get("max_filesize") is not None and "filesize" in f:
        if f["filesize"] > pattern.get("max_filesize"):
            logger.debug(
                "File ignored by {} because it exceeded search pattern filesize limit: {}".format(module_key, f["fn"])
            )
            return False

    # Search by file name (glob)
    if pattern.get("fn") is not None:
        if fnmatch.fnmatch(f["fn"], pattern["fn"]):
            fn_matched = True
            if pattern.get("contents") is None and pattern.get("contents_re") is None:
                return True

    # Search by file name (regex)
    if pattern.get("fn_re") is not None:
        if re.match(pattern["fn_re"], f["fn"]):
            fn_matched = True
            if pattern.get("contents") is None and pattern.get("contents_re") is None:
                return True

    # Search by file contents
    if pattern.get("contents") is not None or pattern.get("contents_re") is not None:
        if pattern.get("contents_re") is not None:
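            # Compile the regex once here, rather than on every line of the file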
            repattern = re.compile(pattern["contents_re"])
        try:
            # Open with `fh` so that the file-info dict `f` isn't shadowed by the handle
            with io.open(os.path.join(f["root"], f["fn"]), "r", encoding="utf-8") as fh:
                line_count = 1
                for line in fh:
                    # Search by file contents (string)
                    if pattern.get("contents") is not None:
                        if pattern["contents"] in line:
                            contents_matched = True
                            if pattern.get("fn") is None and pattern.get("fn_re") is None:
                                return True
                            break
                    # Search by file contents (regex)
                    elif pattern.get("contents_re") is not None:
                        if re.search(repattern, line):
                            contents_matched = True
                            if pattern.get("fn") is None and pattern.get("fn_re") is None:
                                return True
                            break
                    # Break if we've searched enough lines for this pattern
                    if pattern.get("num_lines") and line_count >= pattern.get("num_lines"):
                        break
                    line_count += 1
        except (IOError, OSError, ValueError, UnicodeDecodeError):
            if config.report_readerrors:
                logger.debug("Couldn't read file when looking for output: {}".format(f["fn"]))
            return False

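    # If both filename and contents patterns were given, both must have matched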
    return fn_matched and contents_matched


def exclude_file(sp, f):
    """
    Exclude discovered files if they match the special exclude_
    search pattern keys
    """
    # Make everything a list if it isn't already
    for k in sp:
        if k in ["exclude_fn", "exclude_fn_re", "exclude_contents", "exclude_contents_re"]:
            if not isinstance(sp[k], list):
                sp[k] = [sp[k]]

    # Search by file name (glob)
    if "exclude_fn" in sp:
        for pat in sp["exclude_fn"]:
            if fnmatch.fnmatch(f["fn"], pat):
                return True

    # Search by file name (regex)
    if "exclude_fn_re" in sp:
        for pat in sp["exclude_fn_re"]:
            if re.match(pat, f["fn"]):
                return True

    # Search the contents of the file
    if "exclude_contents" in sp or "exclude_contents_re" in sp:
        # Compile regex patterns if we have any
        if "exclude_contents_re" in sp:
            sp["exclude_contents_re"] = [re.compile(pat) for pat in sp["exclude_contents_re"]]
        with io.open(os.path.join(f["root"], f["fn"]), "r", encoding="utf-8") as fh:
            for line in fh:
                if "exclude_contents" in sp:
                    for pat in sp["exclude_contents"]:
                        if pat in line:
                            return True
                if "exclude_contents_re" in sp:
                    for pat in sp["exclude_contents_re"]:
                        if re.search(pat, line):
                            return True
    return False


def data_sources_tofile():
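    """Write the source file path for each sample in each module section
    to multiqc_sources.<ext>, as JSON, YAML or tab-separated text
    according to config.data_format."""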
    fn = "multiqc_sources.{}".format(config.data_format_extensions[config.data_format])
    with io.open(os.path.join(config.data_dir, fn), "w", encoding="utf-8") as f:
        if config.data_format == "json":
            jsonstr = json.dumps(data_sources, indent=4, ensure_ascii=False)
            print(jsonstr.encode("utf-8", "ignore").decode("utf-8"), file=f)
        elif config.data_format == "yaml":
            yaml.dump(data_sources, f, default_flow_style=False)
        else:
            lines = [["Module", "Section", "Sample Name", "Source"]]
            for mod in data_sources:
                for sec in data_sources[mod]:
                    for s_name, source in data_sources[mod][sec].items():
                        lines.append([mod, sec, s_name, source])
            body = "\n".join(["\t".join(line) for line in lines])
            print(body.encode("utf-8", "ignore").decode("utf-8"), file=f)


def save_htmlid(html_id, skiplint=False):
    """Take an HTML ID, sanitise it for HTML, check for duplicates and save.
    Returns the sanitised, unique ID."""
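    # e.g. " my-id_ "  ->  "my-id"
    #      "1_report"  ->  "mqc_1_report"  (IDs must begin with a letter)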
    global html_ids
    global lint_errors

    # Leading / trailing whitespace
    html_id_clean = html_id.strip()

    # Leading / trailing underscores
    html_id_clean = html_id_clean.strip("_")

    # Must begin with a letter
    if re.match(r"^[a-zA-Z]", html_id_clean) is None:
        html_id_clean = "mqc_{}".format(html_id_clean)

    # Replace illegal characters
    html_id_clean = re.sub("[^a-zA-Z0-9_-]+", "_", html_id_clean)

    # Validate if linting
    if config.lint and not skiplint:
        modname = ""
        codeline = ""
        callstack = inspect.stack()
        for n in callstack:
            if "multiqc/modules/" in n[1] and "base_module.py" not in n[1]:
                callpath = n[1].split("multiqc/modules/", 1)[-1]
                modname = ">{}< ".format(callpath)
                codeline = n[4][0].strip()
                break
    if config.lint and not skiplint and html_id != html_id_clean:
        errmsg = "LINT: {}HTML ID was not clean ('{}' -> '{}') ## {}".format(modname, html_id, html_id_clean, codeline)
        logger.error(errmsg)
        lint_errors.append(errmsg)

    # Check for duplicates
    i = 1
    html_id_base = html_id_clean
    while html_id_clean in html_ids:
        html_id_clean = "{}-{}".format(html_id_base, i)
        i += 1
        if config.lint and not skiplint:
            errmsg = "LINT: {}HTML ID was a duplicate ({}) ## {}".format(modname, html_id_clean, codeline)
            logger.error(errmsg)
            lint_errors.append(errmsg)

    # Remember and return
    html_ids.append(html_id_clean)
    return html_id_clean


def compress_json(data):
    """ Take a Python data object. Convert to JSON and compress using lzstring """
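    # The base64 LZString payload is meant to be decompressed again client-side;
    # lzstring has a matching JavaScript implementation for this.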
    json_string = json.dumps(data).encode("utf-8", "ignore").decode("utf-8")
    json_string = sanitise_json(json_string)
    x = lzstring.LZString()
    return x.compressToBase64(json_string)


def sanitise_json(json_string):
    """
    The Python json module emits some values (NaN, Infinity) which are valid
    JavaScript but invalid JSON, and these crash the browser when the JSON is
    parsed. Nothing in the MultiQC front-end uses these values, so we simply
    find-and-replace them with `null`, which works fine.

    Side effect: any string value containing the standalone word "Infinity"
    (case-sensitive) will have it switched for "null". Hopefully that doesn't
    happen often, otherwise we'll have to do this in a more complicated manner.
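
    For example:

    >>> sanitise_json('{"a": NaN, "b": -Infinity}')
    '{"a": null, "b": null}'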
    """
    json_string = re.sub(r"\bNaN\b", "null", json_string)
    # The optional minus sign must come before the word boundary, otherwise
    # "-Infinity" preceded by a space or ":" would be replaced with "-null"
    json_string = re.sub(r"-?\bInfinity\b", "null", json_string)
    return json_string