1"""
2Downloading NeuroImaging datasets: functional datasets (task + resting-state)
3"""
4import fnmatch
5import glob
6import warnings
7import os
8import re
9import json
10
11import nibabel as nib
12import numpy as np
13import numbers
14
15from io import BytesIO
16
17import nibabel
18import pandas as pd
19from scipy.io import loadmat
20from scipy.io.matlab.miobase import MatReadError
21from sklearn.utils import Bunch, deprecated
22
23from .utils import (_get_dataset_dir, _fetch_files, _get_dataset_descr,
24                    _read_md5_sum_file, _tree, _filter_columns, _fetch_file, _uncompress_file)
25from .._utils import check_niimg, fill_doc
26from .._utils.numpy_conversions import csv_to_array
27from nilearn.image import get_data
28
29
30@fill_doc
31def fetch_haxby(data_dir=None, subjects=(2,),
32                fetch_stimuli=False, url=None, resume=True, verbose=1):
33    """Download and loads complete haxby dataset.
34
35    See :footcite:`Haxby2425`.
36
37    Parameters
38    ----------
39    %(data_dir)s
40    subjects : list or int, optional
        Either a list of subjects or the number of subjects to load, from 1 to
        6. By default, only the second subject is loaded. An empty list
        returns no subject data. Default=(2,).
44
45    fetch_stimuli : boolean, optional
46        Indicate if stimuli images must be downloaded. They will be presented
47        as a dictionary of categories. Default=False.
48    %(url)s
49    %(resume)s
50    %(verbose)s
51
52    Returns
53    -------
    data : sklearn.utils.Bunch
        Dictionary-like object, the attributes of interest are:

        - 'anat': string list. Paths to anatomic images.
        - 'func': string list. Paths to nifti files with bold data.
        - 'session_target': string list. Paths to text files containing session and target data.
        - 'mask': string. Path to full-brain mask file.
        - 'mask_vt': string list. Paths to nifti ventral temporal mask files.
        - 'mask_face': string list. Paths to nifti face-responsive ventral temporal mask files.
        - 'mask_house': string list. Paths to nifti house-responsive ventral temporal mask files.
        - 'mask_face_little': string list. Paths to smaller nifti face-responsive ventral temporal mask files.
        - 'mask_house_little': string list. Paths to smaller nifti house-responsive ventral temporal mask files.
66
67    References
68    ----------
69    .. footbibliography::
70
71    Notes
72    -----
73    PyMVPA provides a tutorial making use of this dataset:
74    http://www.pymvpa.org/tutorial.html
75
76    More information about its structure:
77    http://dev.pymvpa.org/datadb/haxby2001.html
78
    See `additional information
    <http://www.sciencemag.org/content/293/5539/2425>`_
81
82    Run 8 in subject 5 does not contain any task labels.
83    The anatomical image for subject 6 is unavailable.
84
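    Examples
    --------
    A minimal usage sketch (downloads data on first call; assumes network
    access and the default nilearn data directory, see ``data_dir``):

    >>> from nilearn.datasets import fetch_haxby
    >>> haxby = fetch_haxby(subjects=[2])      # doctest: +SKIP
    >>> func_file = haxby.func[0]              # doctest: +SKIP
    >>> vt_mask_file = haxby.mask_vt[0]        # doctest: +SKIP
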
85    """
86    if isinstance(subjects, numbers.Number) and subjects > 6:
87        subjects = 6
88
89    if subjects is not None and (isinstance(subjects, list) or
90                                 isinstance(subjects, tuple)):
91        for sub_id in subjects:
92            if sub_id not in [1, 2, 3, 4, 5, 6]:
93                raise ValueError("You provided invalid subject id {0} in a "
94                                 "list. Subjects must be selected in "
95                                 "[1, 2, 3, 4, 5, 6]".format(sub_id))
96
97    dataset_name = 'haxby2001'
98    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
99                                verbose=verbose)
100
101    # Get the mask
102    url_mask = 'https://www.nitrc.org/frs/download.php/7868/mask.nii.gz'
103    mask = _fetch_files(data_dir, [('mask.nii.gz', url_mask, {})],
104                        verbose=verbose)[0]
105
106    # Dataset files
107    if url is None:
108        url = 'http://data.pymvpa.org/datasets/haxby2001/'
109    md5sums = _fetch_files(data_dir, [('MD5SUMS', url + 'MD5SUMS', {})],
110                           verbose=verbose)[0]
111    md5sums = _read_md5_sum_file(md5sums)
112
113    # definition of dataset files
114    sub_files = ['bold.nii.gz', 'labels.txt',
115                 'mask4_vt.nii.gz', 'mask8b_face_vt.nii.gz',
116                 'mask8b_house_vt.nii.gz', 'mask8_face_vt.nii.gz',
117                 'mask8_house_vt.nii.gz', 'anat.nii.gz']
118    n_files = len(sub_files)
119
120    if subjects is None:
121        subjects = []
122
123    if isinstance(subjects, numbers.Number):
124        subject_mask = np.arange(1, subjects + 1)
125    else:
126        subject_mask = np.array(subjects)
127
128    files = [
129            (os.path.join('subj%d' % i, sub_file),
130             url + 'subj%d-2010.01.14.tar.gz' % i,
131             {'uncompress': True,
132              'md5sum': md5sums.get('subj%d-2010.01.14.tar.gz' % i, None)})
133            for i in subject_mask
134            for sub_file in sub_files
135            if not (sub_file == 'anat.nii.gz' and i == 6)  # no anat for sub. 6
136    ]
137
138    files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
139
140    if ((isinstance(subjects, numbers.Number) and subjects == 6) or
141            np.any(subject_mask == 6)):
142        files.append(None)  # None value because subject 6 has no anat
143
144    kwargs = {}
145    if fetch_stimuli:
146        stimuli_files = [(os.path.join('stimuli', 'README'),
147                          url + 'stimuli-2010.01.14.tar.gz',
148                          {'uncompress': True})]
149        readme = _fetch_files(data_dir, stimuli_files, resume=resume,
150                              verbose=verbose)[0]
151        kwargs['stimuli'] = _tree(os.path.dirname(readme), pattern='*.jpg',
152                                  dictionary=True)
153
154    fdescr = _get_dataset_descr(dataset_name)
155
156    # return the data
157    return Bunch(
158            anat=files[7::n_files],
159            func=files[0::n_files],
160            session_target=files[1::n_files],
161            mask_vt=files[2::n_files],
162            mask_face=files[3::n_files],
163            mask_house=files[4::n_files],
164            mask_face_little=files[5::n_files],
165            mask_house_little=files[6::n_files],
166            mask=mask,
167            description=fdescr,
168            **kwargs)
169
170
171@fill_doc
172def fetch_adhd(n_subjects=30, data_dir=None, url=None, resume=True,
173               verbose=1):
174    """Download and load the ADHD resting-state dataset.
175
176    See :footcite:`ADHDdataset`.
177
178    Parameters
179    ----------
180    n_subjects : int, optional
        The number of subjects to load, from a maximum of 40 subjects.
182        By default, 30 subjects will be loaded. If None is given,
183        all 40 subjects will be loaded. Default=30.
184    %(data_dir)s
185    %(url)s
186    %(resume)s
187    %(verbose)s
188
189    Returns
190    -------
    data : sklearn.utils.Bunch
        Dictionary-like object, the attributes of interest are:

         - 'func': Paths to functional resting-state images
         - 'phenotypic': Structured array with the phenotypic information and
           motion parameters of the selected subjects
         - 'confounds': CSV files containing the nuisance variables
197
198    References
199    ----------
200    .. footbibliography::
201
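    Examples
    --------
    A minimal usage sketch (downloads data on first call; assumes network
    access and the default nilearn data directory, see ``data_dir``):

    >>> from nilearn.datasets import fetch_adhd
    >>> adhd = fetch_adhd(n_subjects=1)        # doctest: +SKIP
    >>> func_file = adhd.func[0]               # doctest: +SKIP
    >>> confounds_file = adhd.confounds[0]     # doctest: +SKIP
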
202    """
203    if url is None:
204        url = 'https://www.nitrc.org/frs/download.php/'
205
206    # Preliminary checks and declarations
207    dataset_name = 'adhd'
208    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
209                                verbose=verbose)
210    ids = ['0010042', '0010064', '0010128', '0021019', '0023008', '0023012',
211           '0027011', '0027018', '0027034', '0027037', '1019436', '1206380',
212           '1418396', '1517058', '1552181', '1562298', '1679142', '2014113',
213           '2497695', '2950754', '3007585', '3154996', '3205761', '3520880',
214           '3624598', '3699991', '3884955', '3902469', '3994098', '4016887',
215           '4046678', '4134561', '4164316', '4275075', '6115230', '7774305',
216           '8409791', '8697774', '9744150', '9750701']
217    nitrc_ids = range(7782, 7822)
218    max_subjects = len(ids)
219    if n_subjects is None:
220        n_subjects = max_subjects
221    if n_subjects > max_subjects:
222        warnings.warn('Warning: there are only %d subjects' % max_subjects)
223        n_subjects = max_subjects
224    ids = ids[:n_subjects]
225    nitrc_ids = nitrc_ids[:n_subjects]
226
227    opts = dict(uncompress=True)
228
229    # Dataset description
230    fdescr = _get_dataset_descr(dataset_name)
231
232    # First, get the metadata
233    phenotypic = ('ADHD200_40subs_motion_parameters_and_phenotypics.csv',
234        url + '7781/adhd40_metadata.tgz', opts)
235
236    phenotypic = _fetch_files(data_dir, [phenotypic], resume=resume,
237                              verbose=verbose)[0]
238
239    # Load the csv file
240    phenotypic = np.genfromtxt(phenotypic, names=True, delimiter=',',
241                               dtype=None)
242
243    # Keep phenotypic information for selected subjects
244    int_ids = np.asarray(ids, dtype=int)
245    phenotypic = phenotypic[[np.where(phenotypic['Subject'] == i)[0][0]
246                             for i in int_ids]]
247
248    # Download dataset files
249
250    archives = [url + '%i/adhd40_%s.tgz' % (ni, ii)
251                for ni, ii in zip(nitrc_ids, ids)]
252    functionals = ['data/%s/%s_rest_tshift_RPI_voreg_mni.nii.gz' % (i, i)
253                   for i in ids]
254    confounds = ['data/%s/%s_regressors.csv' % (i, i) for i in ids]
255
256    functionals = _fetch_files(
257        data_dir, zip(functionals, archives, (opts,) * n_subjects),
258        resume=resume, verbose=verbose)
259
260    confounds = _fetch_files(
261        data_dir, zip(confounds, archives, (opts,) * n_subjects),
262        resume=resume, verbose=verbose)
263
264    return Bunch(func=functionals, confounds=confounds,
265                 phenotypic=phenotypic, description=fdescr)
266
267
268@fill_doc
269def fetch_miyawaki2008(data_dir=None, url=None, resume=True, verbose=1):
270    """Download and loads Miyawaki et al. 2008 dataset (153MB).
271
272    See :footcite:`MIYAWAKI2008915`.
273
274    Parameters
275    ----------
276    %(data_dir)s
277    %(url)s
278    %(resume)s
279    %(verbose)s
280
281    Returns
282    -------
283    data : Bunch
        Dictionary-like object, the attributes of interest are:
285
286        - 'func': string list
287            Paths to nifti file with bold data
288        - 'label': string list
289            Paths to text file containing session and target data
290        - 'mask': string
291            Path to nifti mask file to define target volume in visual
292            cortex
293        - 'background': string
            Path to a nifti file containing an image usable as a
            background for plotting the miyawaki maps.
296
297    References
298    ----------
299    .. footbibliography::
300
301    Notes
302    -----
303    This dataset is available on the `brainliner website
304    <http://brainliner.jp/data/brainliner-admin/Reconstruct>`_
305
306    See `additional information
307    <http://www.cns.atr.jp/dni/en/downloads/
308    fmri-data-set-for-visual-image-reconstruction/>`_
309
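    Examples
    --------
    A minimal usage sketch (downloads about 153MB on first call; assumes
    network access and the default nilearn data directory):

    >>> from nilearn.datasets import fetch_miyawaki2008
    >>> miyawaki = fetch_miyawaki2008()        # doctest: +SKIP
    >>> func_files = miyawaki.func             # doctest: +SKIP
    >>> mask_file = miyawaki.mask              # doctest: +SKIP
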
310    """
311    url = 'https://www.nitrc.org/frs/download.php' \
312          '/8486/miyawaki2008.tgz?i_agree=1&download_now=1'
313    opts = {'uncompress': True}
314
315    # Dataset files
316
317    # Functional MRI:
318    #   * 20 random scans (usually used for training)
319    #   * 12 figure scans (usually used for testing)
320
321    func_figure = [(os.path.join('func', 'data_figure_run%02d.nii.gz' % i),
322                    url, opts) for i in range(1, 13)]
323
324    func_random = [(os.path.join('func', 'data_random_run%02d.nii.gz' % i),
325                    url, opts) for i in range(1, 21)]
326
327    # Labels, 10x10 patches, stimuli shown to the subject:
328    #   * 20 random labels
329    #   * 12 figure labels (letters and shapes)
330
331    label_filename = 'data_%s_run%02d_label.csv'
332    label_figure = [(os.path.join('label', label_filename % ('figure', i)),
333                     url, opts) for i in range(1, 13)]
334
335    label_random = [(os.path.join('label', label_filename % ('random', i)),
336                     url, opts) for i in range(1, 21)]
337
338    # Masks
339
340    file_mask = [
341        'mask.nii.gz',
342        'LHlag0to1.nii.gz',
343        'LHlag10to11.nii.gz',
344        'LHlag1to2.nii.gz',
345        'LHlag2to3.nii.gz',
346        'LHlag3to4.nii.gz',
347        'LHlag4to5.nii.gz',
348        'LHlag5to6.nii.gz',
349        'LHlag6to7.nii.gz',
350        'LHlag7to8.nii.gz',
351        'LHlag8to9.nii.gz',
352        'LHlag9to10.nii.gz',
353        'LHV1d.nii.gz',
354        'LHV1v.nii.gz',
355        'LHV2d.nii.gz',
356        'LHV2v.nii.gz',
357        'LHV3A.nii.gz',
358        'LHV3.nii.gz',
359        'LHV4v.nii.gz',
360        'LHVP.nii.gz',
361        'RHlag0to1.nii.gz',
362        'RHlag10to11.nii.gz',
363        'RHlag1to2.nii.gz',
364        'RHlag2to3.nii.gz',
365        'RHlag3to4.nii.gz',
366        'RHlag4to5.nii.gz',
367        'RHlag5to6.nii.gz',
368        'RHlag6to7.nii.gz',
369        'RHlag7to8.nii.gz',
370        'RHlag8to9.nii.gz',
371        'RHlag9to10.nii.gz',
372        'RHV1d.nii.gz',
373        'RHV1v.nii.gz',
374        'RHV2d.nii.gz',
375        'RHV2v.nii.gz',
376        'RHV3A.nii.gz',
377        'RHV3.nii.gz',
378        'RHV4v.nii.gz',
379        'RHVP.nii.gz'
380    ]
381
382    file_mask = [(os.path.join('mask', m), url, opts) for m in file_mask]
383
384    file_names = func_figure + func_random + \
385                 label_figure + label_random + \
386                 file_mask
387
388    dataset_name = 'miyawaki2008'
389    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
390                                verbose=verbose)
391    files = _fetch_files(data_dir, file_names, resume=resume, verbose=verbose)
392
393    # Fetch the background image
394    bg_img = _fetch_files(data_dir, [('bg.nii.gz', url, opts)], resume=resume,
395                          verbose=verbose)[0]
396
397    fdescr = _get_dataset_descr(dataset_name)
398
399    # Return the data
400    return Bunch(
401        func=files[:32],
402        label=files[32:64],
403        mask=files[64],
404        mask_roi=files[65:],
405        background=bg_img,
406        description=fdescr)
407
408
409@fill_doc
410def fetch_localizer_contrasts(contrasts, n_subjects=None, get_tmaps=False,
411                              get_masks=False, get_anats=False,
412                              data_dir=None, url=None, resume=True, verbose=1):
413    """Download and load Brainomics/Localizer dataset (94 subjects).
414
415    "The Functional Localizer is a simple and fast acquisition
416    procedure based on a 5-minute functional magnetic resonance
417    imaging (fMRI) sequence that can be run as easily and as
418    systematically as an anatomical scan. This protocol captures the
419    cerebral bases of auditory and visual perception, motor actions,
420    reading, language comprehension and mental calculation at an
421    individual level. Individual functional maps are reliable and
422    quite precise. The procedure is described in more detail on the
423    Functional Localizer page."
424    (see https://osf.io/vhtf6/)
425
426    You may cite :footcite:`PAPADOPOULOSORFANOS2017309`
427    when using this dataset.
428
429    Scientific results obtained using this dataset are described
430    in :footcite:`Pinel2007fast`.
431
432    Parameters
433    ----------
434    contrasts : list of str
435        The contrasts to be fetched (for all 94 subjects available).
436        Allowed values are::
437
438        - "checkerboard"
439        - "horizontal checkerboard"
440        - "vertical checkerboard"
441        - "horizontal vs vertical checkerboard"
442        - "vertical vs horizontal checkerboard"
443        - "sentence listening"
444        - "sentence reading"
445        - "sentence listening and reading"
446        - "sentence reading vs checkerboard"
447        - "calculation (auditory cue)"
448        - "calculation (visual cue)"
449        - "calculation (auditory and visual cue)"
450        - "calculation (auditory cue) vs sentence listening"
451        - "calculation (visual cue) vs sentence reading"
452        - "calculation vs sentences"
453        - "calculation (auditory cue) and sentence listening"
454        - "calculation (visual cue) and sentence reading"
455        - "calculation and sentence listening/reading"
456        - "calculation (auditory cue) and sentence listening vs "
457        - "calculation (visual cue) and sentence reading"
458        - "calculation (visual cue) and sentence reading vs checkerboard"
459        - "calculation and sentence listening/reading vs button press"
460        - "left button press (auditory cue)"
461        - "left button press (visual cue)"
462        - "left button press"
463        - "left vs right button press"
464        - "right button press (auditory cue)"
465        - "right button press (visual cue)"
466        - "right button press"
467        - "right vs left button press"
468        - "button press (auditory cue) vs sentence listening"
469        - "button press (visual cue) vs sentence reading"
470        - "button press vs calculation and sentence listening/reading"
471
        or equivalently one can use the original names::
473
474        - "checkerboard"
475        - "horizontal checkerboard"
476        - "vertical checkerboard"
477        - "horizontal vs vertical checkerboard"
478        - "vertical vs horizontal checkerboard"
479        - "auditory sentences"
480        - "visual sentences"
481        - "auditory&visual sentences"
482        - "visual sentences vs checkerboard"
483        - "auditory calculation"
484        - "visual calculation"
485        - "auditory&visual calculation"
486        - "auditory calculation vs auditory sentences"
487        - "visual calculation vs sentences"
488        - "auditory&visual calculation vs sentences"
489        - "auditory processing"
490        - "visual processing"
491        - "visual processing vs auditory processing"
492        - "auditory processing vs visual processing"
493        - "visual processing vs checkerboard"
494        - "cognitive processing vs motor"
495        - "left auditory click"
496        - "left visual click"
497        - "left auditory&visual click"
498        - "left auditory & visual click vs right auditory&visual click"
499        - "right auditory click"
500        - "right visual click"
501        - "right auditory&visual click"
502        - "right auditory & visual click vs left auditory&visual click"
503        - "auditory click vs auditory sentences"
504        - "visual click vs visual sentences"
505        - "auditory&visual motor vs cognitive processing"
506
507    n_subjects : int or list, optional
508        The number or list of subjects to load. If None is given,
509        all 94 subjects are used.
510
511    get_tmaps : boolean, optional
512        Whether t maps should be fetched or not. Default=False.
513
514    get_masks : boolean, optional
515        Whether individual masks should be fetched or not.
516        Default=False.
517
518    get_anats : boolean, optional
519        Whether individual structural images should be fetched or not.
520        Default=False.
521    %(data_dir)s
522    %(url)s
523    %(resume)s
524    %(verbose)s
525
526    Returns
527    -------
528    data : Bunch
        Dictionary-like object, the attributes of interest are:

        - 'cmaps': string list
            Paths to nifti contrast maps
        - 'tmaps': string list (if 'get_tmaps' set to True)
            Paths to nifti t maps
        - 'masks': string list
            Paths to nifti files corresponding to the subjects' individual masks
        - 'anats': string list
            Paths to nifti files corresponding to the subjects' structural images
539
540    References
541    ----------
542    .. footbibliography::
543
544    See Also
545    ---------
546    nilearn.datasets.fetch_localizer_calculation_task
547    nilearn.datasets.fetch_localizer_button_task
548
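    Examples
    --------
    A minimal usage sketch fetching one contrast for two subjects (assumes
    network access and the default nilearn data directory):

    >>> from nilearn.datasets import fetch_localizer_contrasts
    >>> localizer = fetch_localizer_contrasts(
    ...     ["left vs right button press"], n_subjects=2,
    ...     get_tmaps=True)                    # doctest: +SKIP
    >>> cmap = localizer.cmaps[0]              # doctest: +SKIP
    >>> tmap = localizer.tmaps[0]              # doctest: +SKIP
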
549    """
550    if isinstance(contrasts, str):
551        raise ValueError('Contrasts should be a list of strings, but '
552                         'a single string was given: "%s"' % contrasts)
553    if n_subjects is None:
554        n_subjects = 94  # 94 subjects available
555    if (isinstance(n_subjects, numbers.Number) and
556            ((n_subjects > 94) or (n_subjects < 1))):
        warnings.warn("Wrong value for 'n_subjects' (%d). The maximum "
                      "value will be used instead ('n_subjects=94')"
                      % n_subjects)
559        n_subjects = 94  # 94 subjects available
560
561    # we allow the user to use alternatives to Brainomics contrast names
562    contrast_name_wrapper = {
563        # Checkerboard
564        "checkerboard": "checkerboard",
565        "horizontal checkerboard": "horizontal checkerboard",
566        "vertical checkerboard": "vertical checkerboard",
567        "horizontal vs vertical checkerboard":
568            "horizontal vs vertical checkerboard",
569        "vertical vs horizontal checkerboard":
570            "vertical vs horizontal checkerboard",
571        # Sentences
572        "sentence listening": "auditory sentences",
573        "sentence reading": "visual sentences",
574        "sentence listening and reading": "auditory&visual sentences",
575        "sentence reading vs checkerboard": "visual sentences vs checkerboard",
576        # Calculation
577        "calculation (auditory cue)": "auditory calculation",
578        "calculation (visual cue)": "visual calculation",
579        "calculation (auditory and visual cue)": "auditory&visual calculation",
580        "calculation (auditory cue) vs sentence listening":
581            "auditory calculation vs auditory sentences",
582        "calculation (visual cue) vs sentence reading":
583            "visual calculation vs sentences",
584        "calculation vs sentences": "auditory&visual calculation vs sentences",
585        # Calculation + Sentences
586        "calculation (auditory cue) and sentence listening":
587            "auditory processing",
588        "calculation (visual cue) and sentence reading":
589            "visual processing",
590        "calculation (visual cue) and sentence reading vs "
591        "calculation (auditory cue) and sentence listening":
592            "visual processing vs auditory processing",
593        "calculation (auditory cue) and sentence listening vs "
594        "calculation (visual cue) and sentence reading":
595            "auditory processing vs visual processing",
596        "calculation (visual cue) and sentence reading vs checkerboard":
597            "visual processing vs checkerboard",
598        "calculation and sentence listening/reading vs button press":
599            "cognitive processing vs motor",
600        # Button press
601        "left button press (auditory cue)": "left auditory click",
602        "left button press (visual cue)": "left visual click",
603        "left button press": "left auditory&visual click",
604        "left vs right button press": "left auditory & visual click vs "
605            + "right auditory&visual click",
606        "right button press (auditory cue)": "right auditory click",
607        "right button press (visual cue)": "right visual click",
608        "right button press": "right auditory & visual click",
609        "right vs left button press": "right auditory & visual click "
610           + "vs left auditory&visual click",
611        "button press (auditory cue) vs sentence listening":
612            "auditory click vs auditory sentences",
613        "button press (visual cue) vs sentence reading":
614            "visual click vs visual sentences",
615        "button press vs calculation and sentence listening/reading":
616            "auditory&visual motor vs cognitive processing"}
617    allowed_contrasts = list(contrast_name_wrapper.values())
618
619    # convert contrast names
620    contrasts_wrapped = []
621    # get a unique ID for each contrast. It is used to give a unique name to
622    # each download file and avoid name collisions.
623    contrasts_indices = []
624    for contrast in contrasts:
625        if contrast in allowed_contrasts:
626            contrasts_wrapped.append(contrast.title().replace(" ", ""))
627            contrasts_indices.append(allowed_contrasts.index(contrast))
628        elif contrast in contrast_name_wrapper:
629            name = contrast_name_wrapper[contrast]
630            contrasts_wrapped.append(name.title().replace(" ", ""))
631            contrasts_indices.append(allowed_contrasts.index(name))
632        else:
633            raise ValueError("Contrast \'%s\' is not available" % contrast)
634
635    # Get the dataset OSF index
636    dataset_name = "brainomics_localizer"
637    index_url = "https://osf.io/hwbm2/download"
638    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
639                                verbose=verbose)
640    index_file = _fetch_file(index_url, data_dir, verbose=verbose)
641    with open(index_file, "rt") as of:
642        index = json.load(of)
643
644    # Build data URLs that will be fetched
645    files = {}
646    # Download from the relevant OSF project, using hashes generated
647    # from the OSF API. Note the trailing slash. For more info, see:
648    # https://gist.github.com/emdupre/3cb4d564511d495ea6bf89c6a577da74
649    root_url = "https://osf.io/download/{0}/"
650    if isinstance(n_subjects, numbers.Number):
651        subject_mask = np.arange(1, n_subjects + 1)
652    else:
653        subject_mask = np.array(n_subjects)
654    subject_ids = ["S%02d" % s for s in subject_mask]
655    data_types = ["cmaps"]
656    if get_tmaps:
657        data_types.append("tmaps")
658    filenames = []
659
660    def _is_valid_path(path, index, verbose):
661        if path not in index:
662            if verbose > 0:
663                print("Skipping path '{0}'...".format(path))
664            return False
665        return True
666
667    for subject_id in subject_ids:
668        for data_type in data_types:
669            for contrast_id, contrast in enumerate(contrasts_wrapped):
                name_aux = "_".join([data_type, contrast]).replace(" ", "_")
672                file_path = os.path.join(
673                    "brainomics_data", subject_id, "%s.nii.gz" % name_aux)
674                path = "/".join([
675                    "/localizer", "derivatives", "spm_1st_level",
676                    "sub-%s" % subject_id,
677                    "sub-%s_task-localizer_acq-%s_%s.nii.gz" % (
678                        subject_id, contrast, data_type)])
679                if _is_valid_path(path, index, verbose=verbose):
680                    file_url = root_url.format(index[path][1:])
681                    opts = {"move": file_path}
682                    filenames.append((file_path, file_url, opts))
683                    files.setdefault(data_type, []).append(file_path)
684
685    # Fetch masks if asked by user
686    if get_masks:
687        for subject_id in subject_ids:
688            file_path = os.path.join(
689                "brainomics_data", subject_id, "boolean_mask_mask.nii.gz")
690            path = "/".join([
691                "/localizer", "derivatives", "spm_1st_level",
692                "sub-%s" % subject_id, "sub-%s_mask.nii.gz" % subject_id])
693            if _is_valid_path(path, index, verbose=verbose):
694                file_url = root_url.format(index[path][1:])
695                opts = {"move": file_path}
696                filenames.append((file_path, file_url, opts))
697                files.setdefault("masks", []).append(file_path)
698
699    # Fetch anats if asked by user
700    if get_anats:
701        for subject_id in subject_ids:
702            file_path = os.path.join(
703                "brainomics_data", subject_id,
704                "normalized_T1_anat_defaced.nii.gz")
705            path = "/".join([
706                "/localizer", "derivatives", "spm_preprocessing",
707                "sub-%s" % subject_id, "sub-%s_T1w.nii.gz" % subject_id])
708            if _is_valid_path(path, index, verbose=verbose):
709                file_url = root_url.format(index[path][1:])
710                opts = {"move": file_path}
711                filenames.append((file_path, file_url, opts))
712                files.setdefault("anats", []).append(file_path)
713
714    # Fetch subject characteristics
715    participants_file = os.path.join("brainomics_data", "participants.tsv")
716    path = "/localizer/participants.tsv"
717    if _is_valid_path(path, index, verbose=verbose):
718        file_url = root_url.format(index[path][1:])
719        opts = {"move": participants_file}
720        filenames.append((participants_file, file_url, opts))
721
722    # Fetch behavioural
723    behavioural_file = os.path.join(
724        "brainomics_data", "phenotype", "behavioural.tsv")
725    path = "/localizer/phenotype/behavioural.tsv"
726    if _is_valid_path(path, index, verbose=verbose):
727        file_url = root_url.format(index[path][1:])
728        opts = {"move": behavioural_file}
729        filenames.append((behavioural_file, file_url, opts))
730
731    # Actual data fetching
732    fdescr = _get_dataset_descr(dataset_name)
733    _fetch_files(data_dir, filenames, verbose=verbose)
734    for key, value in files.items():
735        files[key] = [os.path.join(data_dir, val) for val in value]
736
737    # Load covariates file
738    from numpy.lib.recfunctions import join_by
739    participants_file = os.path.join(data_dir, participants_file)
740    csv_data = np.recfromcsv(participants_file, delimiter='\t')
741    behavioural_file = os.path.join(data_dir, behavioural_file)
742    csv_data2 = np.recfromcsv(behavioural_file, delimiter='\t')
743    csv_data = join_by(
744        "participant_id", csv_data, csv_data2, usemask=False, asrecarray=True)
745    subject_names = csv_data["participant_id"].tolist()
746    subjects_indices = []
747    for name in subject_ids:
748        name = name.encode("utf8")
749        if name not in subject_names:
750            continue
751        subjects_indices.append(subject_names.index(name))
752    csv_data = csv_data[subjects_indices]
753
754    return Bunch(ext_vars=csv_data, description=fdescr, **files)
755
756
757@fill_doc
758def fetch_localizer_calculation_task(n_subjects=1, data_dir=None, url=None,
759                                     verbose=1):
760    """Fetch calculation task contrast maps from the localizer.
761
762    Parameters
763    ----------
764    n_subjects : int, optional
765        The number of subjects to load. If None is given,
766        all 94 subjects are used. Default=1.
767    %(data_dir)s
768    %(url)s
769    %(verbose)s
770
771    Returns
772    -------
773    data : Bunch
        Dictionary-like object, the attribute of interest is:
775        'cmaps': string list, giving paths to nifti contrast maps
776
777    Notes
778    ------
    This function is only a thin wrapper around fetch_localizer_contrasts,
    provided to keep the examples simpler to read and understand.
781    The 'calculation (auditory and visual cue)' contrast is used.
782
783    See Also
784    ---------
785    nilearn.datasets.fetch_localizer_button_task
786    nilearn.datasets.fetch_localizer_contrasts
787
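    Examples
    --------
    A minimal usage sketch (assumes network access and the default nilearn
    data directory):

    >>> from nilearn.datasets import fetch_localizer_calculation_task
    >>> calc = fetch_localizer_calculation_task(n_subjects=20)  # doctest: +SKIP
    >>> cmaps = calc.cmaps                     # doctest: +SKIP
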
788    """
789    data = fetch_localizer_contrasts(["calculation (auditory and visual cue)"],
790                                     n_subjects=n_subjects,
791                                     get_tmaps=False, get_masks=False,
792                                     get_anats=False, data_dir=data_dir,
793                                     url=url, resume=True, verbose=verbose)
794    return data
795
796
797@fill_doc
798def fetch_localizer_button_task(data_dir=None, url=None,
799                                verbose=1):
800    """Fetch left vs right button press contrast maps from the localizer.
801
802    Parameters
803    ----------
804    %(data_dir)s
805    %(url)s
806    %(verbose)s
807
808    Returns
809    -------
810    data : Bunch
        Dictionary-like object, the attributes of interest are:

        - 'cmaps': string list, giving paths to nifti contrast maps
        - 'tmap': string, giving the path to a nifti t map
        - 'anat': string, giving the path to the normalized anatomical image
816
817    Notes
818    ------
    This function is only a thin wrapper around fetch_localizer_contrasts,
    provided to keep the examples simpler to read and understand.
821    The 'left vs right button press' contrast is used.
822
823    See Also
824    ---------
825    nilearn.datasets.fetch_localizer_calculation_task
826    nilearn.datasets.fetch_localizer_contrasts
827
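    Examples
    --------
    A minimal usage sketch (assumes network access and the default nilearn
    data directory):

    >>> from nilearn.datasets import fetch_localizer_button_task
    >>> button = fetch_localizer_button_task()  # doctest: +SKIP
    >>> cmap, tmap, anat = (button.cmaps[0],
    ...                     button.tmap, button.anat)  # doctest: +SKIP
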
828    """
829    data = fetch_localizer_contrasts(["left vs right button press"],
830                                     n_subjects=[2],
831                                     get_tmaps=True, get_masks=False,
832                                     get_anats=True, data_dir=data_dir,
833                                     url=url, resume=True, verbose=verbose)
834    # Additional keys for backward compatibility
835    data['tmap'] = data['tmaps'][0]
836    data['anat'] = data['anats'][0]
837    return data
838
839
840@fill_doc
841def fetch_abide_pcp(data_dir=None, n_subjects=None, pipeline='cpac',
842                    band_pass_filtering=False, global_signal_regression=False,
843                    derivatives=['func_preproc'],
844                    quality_checked=True, url=None, verbose=1, **kwargs):
845    """Fetch ABIDE dataset.
846
    Fetch the Autism Brain Imaging Data Exchange (ABIDE) dataset with respect
    to criteria that can be passed as parameters. Note that this is the
    preprocessed version of ABIDE provided by the Preprocessed Connectomes
    Project (PCP).
850    See :footcite:`Nielsen2013Multisite`.
851
852    Parameters
853    ----------
854    %(data_dir)s
855    n_subjects : int, optional
856        The number of subjects to load. If None is given,
857        all available subjects are used (this number depends on the
858        preprocessing pipeline used).
859
    pipeline : string {'cpac', 'ccs', 'dparsf', 'niak'}, optional
861        Possible pipelines are "ccs", "cpac", "dparsf" and "niak".
862        Default='cpac'.
863
864    band_pass_filtering : boolean, optional
865        Due to controversies in the literature, band pass filtering is
866        optional. If true, signal is band filtered between 0.01Hz and 0.1Hz.
867        Default=False.
868
    global_signal_regression : boolean, optional
870        Indicates if global signal regression should be applied on the
871        signals. Default=False.
872
873    derivatives : string list, optional
874        Types of downloaded files. Possible values are: alff, degree_binarize,
875        degree_weighted, dual_regression, eigenvector_binarize,
876        eigenvector_weighted, falff, func_mask, func_mean, func_preproc, lfcd,
877        reho, rois_aal, rois_cc200, rois_cc400, rois_dosenbach160, rois_ez,
878        rois_ho, rois_tt, and vmhc. Please refer to the PCP site for more
879        details. Default=['func_preproc'].
880
881    quality_checked : boolean, optional
        If true (default), restrict the list of subjects to those that
        passed quality assessment for all raters. Default=True.
884    %(url)s
885    %(verbose)s
886    kwargs : parameter list, optional
887        Any extra keyword argument will be used to filter downloaded subjects
888        according to the CSV phenotypic file. Some examples of filters are
889        indicated below.
890
891    SUB_ID : list of integers in [50001, 50607], optional
892        Ids of the subjects to be loaded.
893
894    DX_GROUP : integer in {1, 2}, optional
895        1 is autism, 2 is control.
896
897    DSM_IV_TR : integer in [0, 4], optional
        0 is control, 1 is autism, 2 is Asperger, 3 is PDD-NOS,
        4 is Asperger or PDD-NOS.
900
901    AGE_AT_SCAN : float in [6.47, 64], optional
902        Age of the subject.
903
904    SEX : integer in {1, 2}, optional
905        1 is male, 2 is female.
906
907    HANDEDNESS_CATEGORY : string in {'R', 'L', 'Mixed', 'Ambi'}, optional
        R = Right, L = Left, Mixed = Mixed-handed, Ambi = Ambidextrous.
909
910    HANDEDNESS_SCORE : integer in [-100, 100], optional
911        Positive = Right, Negative = Left, 0 = Ambidextrous.
912
913    Notes
914    -----
915    Code and description of preprocessing pipelines are provided on the
    `PCP website <http://preprocessed-connectomes-project.github.io/>`_.
917
918    References
919    ----------
920    .. footbibliography::
921
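    Examples
    --------
    A minimal usage sketch restricted to a few quality-checked subjects
    (assumes network access and the default nilearn data directory):

    >>> from nilearn.datasets import fetch_abide_pcp
    >>> abide = fetch_abide_pcp(n_subjects=5,
    ...                         derivatives=['func_preproc'])  # doctest: +SKIP
    >>> func_files = abide.func_preproc       # doctest: +SKIP
    >>> phenotypic = abide.phenotypic         # doctest: +SKIP
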
922    """
    # People keep getting it wrong and submitting a string instead of a
924    # list of strings. We'll make their life easy
925    if isinstance(derivatives, str):
926        derivatives = [derivatives, ]
927
928    # Parameter check
929    for derivative in derivatives:
930        if derivative not in [
931                'alff', 'degree_binarize', 'degree_weighted',
932                'dual_regression', 'eigenvector_binarize',
933                'eigenvector_weighted', 'falff', 'func_mask', 'func_mean',
934                'func_preproc', 'lfcd', 'reho', 'rois_aal', 'rois_cc200',
935                'rois_cc400', 'rois_dosenbach160', 'rois_ez', 'rois_ho',
936                'rois_tt', 'vmhc']:
937            raise KeyError('%s is not a valid derivative' % derivative)
938
939    strategy = ''
940    if not band_pass_filtering:
941        strategy += 'no'
942    strategy += 'filt_'
943    if not global_signal_regression:
944        strategy += 'no'
945    strategy += 'global'
946
947    # General file: phenotypic information
948    dataset_name = 'ABIDE_pcp'
949    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
950                                verbose=verbose)
951    if url is None:
952        url = ('https://s3.amazonaws.com/fcp-indi/data/Projects/'
953               'ABIDE_Initiative')
954
955    if quality_checked:
956        kwargs['qc_rater_1'] = b'OK'
957        kwargs['qc_anat_rater_2'] = [b'OK', b'maybe']
958        kwargs['qc_func_rater_2'] = [b'OK', b'maybe']
959        kwargs['qc_anat_rater_3'] = b'OK'
960        kwargs['qc_func_rater_3'] = b'OK'
961
962    # Fetch the phenotypic file and load it
963    csv = 'Phenotypic_V1_0b_preprocessed1.csv'
964    path_csv = _fetch_files(data_dir, [(csv, url + '/' + csv, {})],
965                            verbose=verbose)[0]
966
    # Note: the phenotypic file contains quoted strings that themselves
    # contain commas, which breaks numpy's CSV parsing. We therefore replace
    # commas that appear inside double-quoted fields with semicolons before
    # loading. A pandas equivalent would be:
971    # pheno = pandas.read_csv(path_csv).to_records()
972    with open(path_csv, 'r') as pheno_f:
973        pheno = ['i' + pheno_f.readline()]
974
975        # This regexp replaces commas between double quotes
976        for line in pheno_f:
977            pheno.append(re.sub(r',(?=[^"]*"(?:[^"]*"[^"]*")*[^"]*$)', ";", line))
978
979    # bytes (encode()) needed for python 2/3 compat with numpy
980    pheno = '\n'.join(pheno).encode()
981    pheno = BytesIO(pheno)
982    pheno = np.recfromcsv(pheno, comments='$', case_sensitive=True)
983
984    # First, filter subjects with no filename
985    pheno = pheno[pheno['FILE_ID'] != b'no_filename']
986    # Apply user defined filters
987    user_filter = _filter_columns(pheno, kwargs)
988    pheno = pheno[user_filter]
989
990    # Go into specific data folder and url
991    data_dir = os.path.join(data_dir, pipeline, strategy)
992    url = '/'.join([url, 'Outputs', pipeline, strategy])
993
994    # Get the files
995    results = {}
996    file_ids = [file_id.decode() for file_id in pheno['FILE_ID']]
997    if n_subjects is not None:
998        file_ids = file_ids[:n_subjects]
999        pheno = pheno[:n_subjects]
1000
1001    results['description'] = _get_dataset_descr(dataset_name)
1002    results['phenotypic'] = pheno
1003    for derivative in derivatives:
1004        ext = '.1D' if derivative.startswith('rois') else '.nii.gz'
1005        files = []
1006        for file_id in file_ids:
1007            file_ = [(
1008                file_id + '_' + derivative + ext,
1009                '/'.join([url, derivative, file_id + '_' + derivative + ext]),
1010                {}
1011            )]
1012            files.append(_fetch_files(data_dir, file_, verbose=verbose)[0])
1013        # Load derivatives if needed
1014        if ext == '.1D':
1015            files = [np.loadtxt(f) for f in files]
1016        results[derivative] = files
1017    return Bunch(**results)
1018
1019
1020def _load_mixed_gambles(zmap_imgs):
1021    """Ravel zmaps (one per subject) along time axis, resulting,
1022    in a n_subjects * n_trials 3D niimgs and, and then make
1023    gain vector y of same length.
1024    """
1025    X = []
1026    y = []
1027    mask = []
1028    for zmap_img in zmap_imgs:
1029        # load subject data
1030        this_X = get_data(zmap_img)
1031        affine = zmap_img.affine
1032        finite_mask = np.all(np.isfinite(this_X), axis=-1)
1033        this_mask = np.logical_and(np.all(this_X != 0, axis=-1),
1034                                   finite_mask)
        # gain levels: 1 to 8, repeated over 6 blocks (48 trials per subject)
        this_y = np.array([np.arange(1, 9)] * 6).ravel()

1038        if len(this_y) != this_X.shape[-1]:
1039            raise RuntimeError("%s: Expecting %i volumes, got %i!" % (
1040                zmap_img, len(this_y), this_X.shape[-1]))
1041
1042        # standardize subject data
1043        this_X -= this_X.mean(axis=-1)[..., np.newaxis]
1044        std = this_X.std(axis=-1)
1045        std[std == 0] = 1
1046        this_X /= std[..., np.newaxis]
1047
1048        # commit subject data
1049        X.append(this_X)
1050        y.extend(this_y)
1051        mask.append(this_mask)
1052    y = np.array(y)
1053    X = np.concatenate(X, axis=-1)
1054    mask = np.sum(mask, axis=0) > .5 * len(mask)
1055    mask = np.logical_and(mask, np.all(np.isfinite(X), axis=-1))
1056    X = X[mask, :].T
1057    tmp = np.zeros(list(mask.shape) + [len(X)])
1058    tmp[mask, :] = X.T
1059    mask_img = nibabel.Nifti1Image(mask.astype(int), affine)
1060    X = nibabel.four_to_three(nibabel.Nifti1Image(tmp, affine))
1061    return X, y, mask_img
1062
1063
1064@fill_doc
1065def fetch_mixed_gambles(n_subjects=1, data_dir=None, url=None, resume=True,
1066                        return_raw_data=False, verbose=1):
1067    """Fetch Jimura "mixed gambles" dataset.
1068
1069    See :footcite:`JIMURA2012544`.
1070
1071    Parameters
1072    ----------
1073    n_subjects : int, optional
1074        The number of subjects to load. If None is given, all the
1075        subjects are used. Default=1.
1076    %(data_dir)s
1077    %(url)s
1078    %(resume)s
1079    %(verbose)s
1080    return_raw_data : bool, optional
        If False, the data will be transformed into an (X, y) pair, suitable
1082        for machine learning routines. X is a list of n_subjects * 48
1083        Nifti1Image objects (where 48 is the number of trials),
1084        and y is an array of shape (n_subjects * 48,).
1085        Default=False.
1086
1087    Returns
1088    -------
1089    data : Bunch
        Dictionary-like object, the attributes of interest are:
        'zmaps': string list
            Paths to realigned gain betamaps (one nifti per subject). If
            return_raw_data is False, this is instead a list of
            n_subjects * 48 Nifti1Image objects (one 3D image per trial).
        'gain': array of shape (n_subjects * 48,)
            Gain value for each trial (only present when return_raw_data
            is False).
        'mask_img': Nifti1Image
            Mask image (only present when return_raw_data is False).
        'subject_id': array of shape (n_subjects * 48,)
            Subject index for each trial.
1099
1100    References
1101    ----------
1102    .. footbibliography::
1103
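    Examples
    --------
    A minimal usage sketch (assumes network access and the default nilearn
    data directory):

    >>> from nilearn.datasets import fetch_mixed_gambles
    >>> gambles = fetch_mixed_gambles(n_subjects=1,
    ...                               return_raw_data=True)  # doctest: +SKIP
    >>> zmap_file = gambles.zmaps[0]           # doctest: +SKIP
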
1104    """
    if n_subjects is None:
        n_subjects = 16
    if n_subjects > 16:
        warnings.warn('Warning: there are only 16 subjects!')
        n_subjects = 16
1108    if url is None:
1109        url = ("https://www.nitrc.org/frs/download.php/7229/"
1110               "jimura_poldrack_2012_zmaps.zip")
1111    opts = dict(uncompress=True)
1112    files = [("zmaps%ssub%03i_zmaps.nii.gz" % (os.sep, (j + 1)), url, opts)
1113             for j in range(n_subjects)]
1114    data_dir = _get_dataset_dir('jimura_poldrack_2012_zmaps',
1115                                data_dir=data_dir)
1116    zmap_fnames = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
1117    subject_id = np.repeat(np.arange(n_subjects), 6 * 8)
1118    data = Bunch(zmaps=zmap_fnames,
1119                 subject_id=subject_id)
1120    if not return_raw_data:
1121        X, y, mask_img = _load_mixed_gambles(check_niimg(data.zmaps,
1122                                                         return_iterator=True))
1123        data.zmaps, data.gain, data.mask_img = X, y, mask_img
1124    return data
1125
1126
1127@fill_doc
1128def fetch_megatrawls_netmats(dimensionality=100, timeseries='eigen_regression',
1129                             matrices='partial_correlation', data_dir=None,
1130                             resume=True, verbose=1):
1131    """Downloads and returns Network Matrices data from MegaTrawls release in HCP.
1132
1133    This data can be used to predict relationships between imaging data and
1134    non-imaging behavioural measures such as age, sex, education, etc.
1135    The network matrices are estimated from functional connectivity
1136    datasets of 461 subjects. Full technical details in references.
1137
1138    More information available in :footcite:`smithhcp2015`,
1139    :footcite:`smith2015positive`, :footcite:`Filippini7209`,
1140    :footcite:`smith2014methods`, and :footcite:`reilly2009cerebellum`.
1141
1142    Parameters
1143    ----------
1144    dimensionality : int, optional
1145        Valid inputs are 25, 50, 100, 200, 300. By default, network matrices
1146        estimated using Group ICA brain parcellations of 100 components/dimensions
1147        will be returned. Default=100.
1148
1149    timeseries : str, optional
        Valid inputs are 'multiple_spatial_regression' or 'eigen_regression'.
        With 'eigen_regression' (the default), matrices estimated from the
        first principal eigen component timeseries signals extracted from
        each subject's parcellations are returned. With
        'multiple_spatial_regression', matrices estimated from spatial
        regressor based timeseries signals extracted from each subject's
        parcellations are returned.
        Default='eigen_regression'.
1157
1158    matrices : str, optional
        Valid inputs are 'full_correlation' or 'partial_correlation'.
        By default, partial correlation matrices are returned; otherwise,
        full correlation matrices are returned.
1162        Default='partial_correlation'.
1163    %(data_dir)s
1164    %(resume)s
1165    %(verbose)s
1166
1167    Returns
1168    -------
1169    data : Bunch
        Dictionary-like object, the attributes are:
1171
1172        - 'dimensions': int, consists of given input in dimensions.
1173
1174        - 'timeseries': str, consists of given input in timeseries method.
1175
1176        - 'matrices': str, consists of given type of specific matrices.
1177
1178        - 'correlation_matrices': ndarray, consists of correlation matrices
1179          based on given type of matrices. Array size will depend on given
1180          dimensions (n, n).
1181
1182        - 'description': data description
1183
1184    References
1185    ----------
1186    .. footbibliography::
1187
1188    Notes
1189    -----
1190    See description for terms & conditions on data usage.
1191
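    Examples
    --------
    A minimal usage sketch (assumes network access and the default nilearn
    data directory):

    >>> from nilearn.datasets import fetch_megatrawls_netmats
    >>> netmats = fetch_megatrawls_netmats(
    ...     dimensionality=100,
    ...     matrices='partial_correlation')    # doctest: +SKIP
    >>> conn = netmats.correlation_matrices   # doctest: +SKIP
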
1192    """
1193    url = "http://www.nitrc.org/frs/download.php/8037/Megatrawls.tgz"
1194    opts = {'uncompress': True}
1195
1196    error_message = "Invalid {0} input is provided: {1}, choose one of them {2}"
1197    # standard dataset terms
1198    dimensionalities = [25, 50, 100, 200, 300]
1199    if dimensionality not in dimensionalities:
1200        raise ValueError(error_message.format('dimensionality', dimensionality,
1201                                              dimensionalities))
1202    timeseries_methods = ['multiple_spatial_regression', 'eigen_regression']
1203    if timeseries not in timeseries_methods:
1204        raise ValueError(error_message.format('timeseries', timeseries,
1205                                              timeseries_methods))
1206    output_matrices_names = ['full_correlation', 'partial_correlation']
1207    if matrices not in output_matrices_names:
1208        raise ValueError(error_message.format('matrices', matrices,
1209                                              output_matrices_names))
1210
1211    dataset_name = 'Megatrawls'
1212    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose)
1213    description = _get_dataset_descr(dataset_name)
1214
1215    timeseries_map = dict(multiple_spatial_regression='ts2', eigen_regression='ts3')
1216    matrices_map = dict(full_correlation='Znet1.txt', partial_correlation='Znet2.txt')
1217    filepath = [(os.path.join(
1218        '3T_Q1-Q6related468_MSMsulc_d%d_%s' % (dimensionality, timeseries_map[timeseries]),
1219        matrices_map[matrices]), url, opts)]
1220
1221    # Fetch all the files
1222    files = _fetch_files(data_dir, filepath, resume=resume, verbose=verbose)
1223
1224    # Load the files into arrays
1225    correlation_matrices = csv_to_array(files[0])
1226
1227    return Bunch(
1228        dimensions=dimensionality,
1229        timeseries=timeseries,
1230        matrices=matrices,
1231        correlation_matrices=correlation_matrices,
1232        description=description)
1233
1234
1235@fill_doc
1236@deprecated("'fetch_cobre' has been deprecated and will be removed "
1237            "in release 0.9 . "
1238            "Please consider using a different datasets or downloading it "
1239            "with a different tool than nilearn.")
1240def fetch_cobre(n_subjects=10, data_dir=None, url=None, verbose=1):
1241    """Fetch COBRE datasets preprocessed using NIAK 0.17 under CentOS
1242    version 6.3 with Octave version 4.0.2 and the Minc toolkit version 0.3.18.
1243
1244    Downloads and returns COBRE preprocessed resting state fMRI datasets,
1245    covariates and phenotypic information such as demographic, clinical
1246    variables, measure of frame displacement FD (an average FD for all the time
1247    frames left after censoring).
1248
1249    Each subject `fmri_XXXXXXX.nii.gz` is a 3D+t nifti volume (150 volumes).
    WARNING: no confounds were actually regressed from the data, so that the
    user can do it interactively and easily explore different analytical
    paths.
1253
    For each subject, there is an `fmri_XXXXXXX.tsv` file containing
    covariates such as motion parameters and the mean CSF signal, which
    should be regressed out of the functional data.
1257
    `keys_confounds.json`: a json file that describes each variable mentioned
1259    in the files `fmri_XXXXXXX.tsv.gz`. It also contains a list of time frames
1260    that have been removed from the time series by censoring for high motion.
1261
    `phenotypic_data.tsv` contains the clinical variables, which are
    explained in `keys_phenotypic_data.json`.
1264
1265    .. versionadded:: 0.3
1266
1267    Warnings
1268    --------
1269    'fetch_cobre' has been deprecated and will be removed in release 0.9.
1270
1271    Parameters
1272    ----------
1273    n_subjects : int, optional
1274        The number of subjects to load from maximum of 146 subjects.
1275        By default, 10 subjects will be loaded. If n_subjects=None,
1276        all subjects will be loaded. Default=10.
1277    %(data_dir)s
1278    %(url)s
1279    %(verbose)s
1280
1281    Returns
1282    -------
1283    data : Bunch
1284        Dictionary-like object, the attributes are:
1285
1286        - 'func': string list
1287            Paths to Nifti images.
1288        - 'confounds': string list
1289            Paths to .tsv files of each subject, confounds.
1290        - 'phenotypic': numpy.recarray
1291            Contains data of clinical variables, sex, age, FD.
1292        - 'description': data description of the release and references.
1293        - 'desc_con': str
1294            description of the confounds variables
1295        - 'desc_phenotypic': str
1296            description of the phenotypic variables.
1297
1298    Notes
1299    -----
1300    See `more information about datasets structure
1301    <https://figshare.com/articles/COBRE_preprocessed_with_NIAK_0_17_-_lightweight_release/4197885>`_
1302
1303    """
1304    if url is None:
1305        # Here we use the file that provides URL for all others
1306        url = 'https://api.figshare.com/v2/articles/4197885'
1307    dataset_name = 'cobre'
1308    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
1309                                verbose=verbose)
1310    fdescr = _get_dataset_descr(dataset_name)
1311
1312    # First, fetch the file that references all individual URLs
1313    files = _fetch_files(data_dir, [("4197885", url, {})],
1314                         verbose=verbose)[0]
1315
1316    files = json.load(open(files, 'r'))
1317    files = files['files']
1318    # Index files by name
1319    files_ = {}
1320    for f in files:
1321        files_[f['name']] = f
1322    files = files_
1323
1324    # Fetch the phenotypic file and load it
1325    csv_name_gz = 'phenotypic_data.tsv.gz'
1326    csv_name = os.path.splitext(csv_name_gz)[0]
1327    csv_file_phen = _fetch_files(
1328        data_dir, [(csv_name, files[csv_name_gz]['download_url'],
1329                    {'md5': files[csv_name_gz].get('md5', None),
1330                     'move': csv_name_gz,
1331                     'uncompress': True})],
1332        verbose=verbose)[0]
1333
1334    # Load file in filename to numpy arrays
1335    names = ['ID', 'Current Age', 'Gender', 'Handedness', 'Subject Type',
1336             'Diagnosis', 'Frames OK', 'FD', 'FD Scrubbed']
1337
1338    csv_array_phen = np.recfromcsv(csv_file_phen, names=names,
1339                                   skip_header=True, delimiter='\t')
1340
1341    # Check number of subjects
1342    max_subjects = len(csv_array_phen)
1343    if n_subjects is None:
1344        n_subjects = max_subjects
1345
1346    if n_subjects > max_subjects:
1347        warnings.warn('Warning: there are only %d subjects' % max_subjects)
1348        n_subjects = max_subjects
1349
1350    sz_count = list(csv_array_phen['subject_type']).count(b'Patient')
1351    ct_count = list(csv_array_phen['subject_type']).count(b'Control')
1352
1353    n_sz = np.round(float(n_subjects) / max_subjects * sz_count).astype(int)
1354    n_ct = np.round(float(n_subjects) / max_subjects * ct_count).astype(int)
1355
1356    # First, restrict the csv files to the adequate number of subjects
1357    sz_ids = csv_array_phen[csv_array_phen['subject_type'] ==
1358                            b'Patient']['id'][:n_sz]
1359    ct_ids = csv_array_phen[csv_array_phen['subject_type'] ==
1360                            b'Control']['id'][:n_ct]
1361    ids = np.hstack([sz_ids, ct_ids])
1362    csv_array_phen = csv_array_phen[np.in1d(csv_array_phen['id'], ids)]
1363
1364    # Call fetch_files once per subject.
1365
1366    func = []
1367    con = []
1368    for i in ids:
1369        f = 'fmri_00' + str(i) + '.nii.gz'
1370        c_gz = 'fmri_00' + str(i) + '.tsv.gz'
1371        c = os.path.splitext(c_gz)[0]
1372
1373        f, c = _fetch_files(
1374            data_dir,
1375            [(f, files[f]['download_url'], {'md5': files[f].get('md5', None),
1376                                            'move': f}),
1377             (c, files[c_gz]['download_url'],
1378              {'md5': files[c_gz].get('md5', None),
1379               'move': c_gz, 'uncompress': True})
1380             ],
1381            verbose=verbose)
1382        func.append(f)
1383        con.append(c)
1384
    # Fetch the complementary files
1386    keys_con = "keys_confounds.json"
1387    keys_phen = "keys_phenotypic_data.json"
1388
1389    csv_keys_con, csv_keys_phen = _fetch_files(
1390        data_dir,
1391        [(keys_con, files[keys_con]['download_url'],
1392          {'md5': files[keys_con].get('md5', None), 'move': keys_con}),
1393         (keys_phen, files[keys_phen]['download_url'],
1394         {'md5': files[keys_phen].get('md5', None), 'move': keys_phen})
1395         ],
1396        verbose=verbose)
1397
    with open(csv_keys_con, 'r') as f:
        files_keys_con = f.read()
    with open(csv_keys_phen, 'r') as f:
        files_keys_phen = f.read()
1400
1401    return Bunch(func=func, confounds=con, phenotypic=csv_array_phen,
1402                 description=fdescr, desc_con=files_keys_con,
1403                 desc_phenotypic=files_keys_phen)
1404
1405
1406@fill_doc
1407def fetch_surf_nki_enhanced(n_subjects=10, data_dir=None,
1408                            url=None, resume=True, verbose=1):
1409    """Download and load the NKI enhanced resting-state dataset,
1410    preprocessed and projected to the fsaverage5 space surface.
1411
1412    See :footcite:`Nooner2012NKI`.
1413
1414    Direct download link :footcite:`NKIdataset`.
1415
1416    .. versionadded:: 0.3
1417
1418    Parameters
1419    ----------
1420    n_subjects : int, optional
        The number of subjects to load, from a maximum of 102 subjects.
1422        By default, 10 subjects will be loaded. If None is given,
1423        all 102 subjects will be loaded. Default=10.
1424    %(data_dir)s
1425    %(url)s
1426    %(resume)s
1427    %(verbose)s
1428
1429    Returns
1430    -------
1431    data : sklearn.datasets.base.Bunch
1432        Dictionary-like object, the interest attributes are :
1433
        - 'func_left': Paths to Gifti files containing resting state
          time series for the left hemisphere.
        - 'func_right': Paths to Gifti files containing resting state
          time series for the right hemisphere.
        - 'phenotypic': array containing subject ID, age, dominant hand
          and sex for each subject.
1440        - 'description': data description of the release and references.
1441
1442    References
1443    ----------
1444    .. footbibliography::
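
    Examples
    --------
    A minimal usage sketch; the Gifti files are downloaded on the first
    call and cached under ``data_dir``::

        from nilearn.datasets import fetch_surf_nki_enhanced

        nki = fetch_surf_nki_enhanced(n_subjects=1)
        left_hemisphere_file = nki.func_left[0]
        right_hemisphere_file = nki.func_right[0]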
1445
1446    """
1447    if url is None:
1448        url = 'https://www.nitrc.org/frs/download.php/'
1449
1450    # Preliminary checks and declarations
1451    dataset_name = 'nki_enhanced_surface'
1452    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
1453                                verbose=verbose)
1454    ids = ['A00028185', 'A00033747', 'A00035072', 'A00035827', 'A00035840',
1455           'A00037112', 'A00037511', 'A00038998', 'A00039391', 'A00039431',
1456           'A00039488', 'A00040524', 'A00040623', 'A00040944', 'A00043299',
1457           'A00043520', 'A00043677', 'A00043722', 'A00045589', 'A00050998',
1458           'A00051063', 'A00051064', 'A00051456', 'A00051457', 'A00051477',
1459           'A00051513', 'A00051514', 'A00051517', 'A00051528', 'A00051529',
1460           'A00051539', 'A00051604', 'A00051638', 'A00051658', 'A00051676',
1461           'A00051678', 'A00051679', 'A00051726', 'A00051774', 'A00051796',
1462           'A00051835', 'A00051882', 'A00051925', 'A00051927', 'A00052070',
1463           'A00052117', 'A00052118', 'A00052126', 'A00052180', 'A00052197',
1464           'A00052214', 'A00052234', 'A00052307', 'A00052319', 'A00052499',
1465           'A00052502', 'A00052577', 'A00052612', 'A00052639', 'A00053202',
1466           'A00053369', 'A00053456', 'A00053474', 'A00053546', 'A00053576',
1467           'A00053577', 'A00053578', 'A00053625', 'A00053626', 'A00053627',
1468           'A00053874', 'A00053901', 'A00053927', 'A00053949', 'A00054038',
1469           'A00054153', 'A00054173', 'A00054358', 'A00054482', 'A00054532',
1470           'A00054533', 'A00054534', 'A00054621', 'A00054895', 'A00054897',
1471           'A00054913', 'A00054929', 'A00055061', 'A00055215', 'A00055352',
1472           'A00055353', 'A00055542', 'A00055738', 'A00055763', 'A00055806',
1473           'A00056097', 'A00056098', 'A00056164', 'A00056372', 'A00056452',
1474           'A00056489', 'A00056949']
1475
1476    nitrc_ids = range(8260, 8464)
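    # Consecutive NITRC file ids: for the i-th subject in `ids`,
    # nitrc_ids[2 * i] points to the left-hemisphere file and
    # nitrc_ids[2 * i + 1] to the right-hemisphere file (see loop below).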
1477    max_subjects = len(ids)
1478    if n_subjects is None:
1479        n_subjects = max_subjects
1480    if n_subjects > max_subjects:
1481        warnings.warn('Warning: there are only %d subjects' % max_subjects)
1482        n_subjects = max_subjects
1483    ids = ids[:n_subjects]
1484
1485    # Dataset description
1486    fdescr = _get_dataset_descr(dataset_name)
1487
1488    # First, get the metadata
1489    phenotypic_file = 'NKI_enhanced_surface_phenotypics.csv'
1490    phenotypic = (phenotypic_file, url + '8470/pheno_nki_nilearn.csv',
1491                  {'move': phenotypic_file})
1492
1493    phenotypic = _fetch_files(data_dir, [phenotypic], resume=resume,
1494                              verbose=verbose)[0]
1495
1496    # Load the csv file
1497    phenotypic = np.genfromtxt(phenotypic, skip_header=True,
1498                               names=['Subject', 'Age',
1499                                      'Dominant Hand', 'Sex'],
1500                               delimiter=',', dtype=['U9', '<f8',
1501                                                     'U1', 'U1'])
1502
1503    # Keep phenotypic information for selected subjects
1504    int_ids = np.asarray(ids)
1505    phenotypic = phenotypic[[np.where(phenotypic['Subject'] == i)[0][0]
1506                             for i in int_ids]]
1507
1508    # Download subjects' datasets
1509    func_right = []
1510    func_left = []
1511    for i in range(len(ids)):
1512
1513        archive = url + '%i/%s_%s_preprocessed_fsaverage5_fwhm6.gii'
1514        func = os.path.join('%s', '%s_%s_preprocessed_fwhm6.gii')
1515        rh = _fetch_files(data_dir,
1516                          [(func % (ids[i], ids[i], 'right'),
1517                           archive % (nitrc_ids[2*i+1], ids[i], 'rh'),
1518                           {'move': func % (ids[i], ids[i], 'right')}
1519                            )],
1520                          resume=resume, verbose=verbose)
1521        lh = _fetch_files(data_dir,
1522                          [(func % (ids[i], ids[i], 'left'),
1523                           archive % (nitrc_ids[2*i], ids[i], 'lh'),
1524                           {'move': func % (ids[i], ids[i], 'left')}
1525                            )],
1526                          resume=resume, verbose=verbose)
1527
1528        func_right.append(rh[0])
1529        func_left.append(lh[0])
1530
1531    return Bunch(func_left=func_left, func_right=func_right,
1532                 phenotypic=phenotypic,
1533                 description=fdescr)
1534
1535
1536@fill_doc
1537def _fetch_development_fmri_participants(data_dir, url, verbose):
    """Helper function for fetch_development_fmri.

    This function downloads and loads participants' data from a .tsv
    file hosted on the Open Science Framework (OSF).

    The original .tsv file contains many columns, but this function keeps
    only the relevant ones.
1545
1546    Parameters
1547    ----------
1548    %(data_dir)s
1549    %(url)s
1550    %(verbose)s
1551
1552    Returns
1553    -------
    participants : numpy.ndarray
        Contains each subject's age, age group (child or adult), gender
        and handedness.
1557
1558    """
1559    dataset_name = 'development_fmri'
1560    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
1561                                verbose=verbose)
1562
1563    if url is None:
1564        url = 'https://osf.io/yr3av/download'
1565
1566    files = [('participants.tsv', url, {'move': 'participants.tsv'})]
1567    path_to_participants = _fetch_files(data_dir, files, verbose=verbose)[0]
1568
1569    # Load path to participants
1570    dtype = [('participant_id', 'U12'), ('Age', '<f8'), ('AgeGroup', 'U6'),
1571             ('Child_Adult', 'U5'), ('Gender', 'U4'), ('Handedness', 'U4')]
1572    names = ['participant_id', 'Age', 'AgeGroup', 'Child_Adult', 'Gender',
1573             'Handedness']
1574    participants = csv_to_array(path_to_participants, skip_header=True,
1575                                dtype=dtype, names=names)
1576    return participants
1577
1578
1579@fill_doc
1580def _fetch_development_fmri_functional(participants, data_dir, url, resume,
1581                                       verbose):
    """Helper function for fetch_development_fmri.

    This function downloads the functional MRI data in Nifti format and
    the corresponding confounds for each subject.

    The files are downloaded from the Open Science Framework (OSF).
1588
1589    Parameters
1590    ----------
    participants : numpy.ndarray
        Must contain a column 'participant_id' with the subject ids.
        Files are fetched for the ids listed in this column.
1594    %(data_dir)s
1595    %(url)s
1596    %(resume)s
1597    %(verbose)s
1598
1599    Returns
1600    -------
1601    func : list of str (Nifti files)
1602        Paths to functional MRI data (4D) for each subject.
1603
1604    regressors : list of str (tsv files)
1605        Paths to regressors related to each subject.
1606
1607    """
1608    dataset_name = 'development_fmri'
1609    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
1610                                verbose=verbose)
1611
1612    if url is None:
1613        # Download from the relevant OSF project, using hashes generated
1614        # from the OSF API. Note the trailing slash. For more info, see:
1615        # https://gist.github.com/emdupre/3cb4d564511d495ea6bf89c6a577da74
1616        url = 'https://osf.io/download/{}/'
1617
1618    confounds = '{}_task-pixar_desc-confounds_regressors.tsv'
1619    func = '{0}_task-pixar_space-MNI152NLin2009cAsym_desc-preproc_bold.nii.gz'
1620
    # The csv file below contains unique download keys per Nifti file and
    # confounds file, pre-extracted from OSF. They are required for
    # downloading the files.
1623    package_directory = os.path.dirname(os.path.abspath(__file__))
1624    dtype = [('participant_id', 'U12'), ('key_regressor', 'U24'),
1625             ('key_bold', 'U24')]
1626    names = ['participant_id', 'key_r', 'key_b']
    # csv file with download information related to the Open Science
    # Framework (OSF)
1628    osf_data = csv_to_array(os.path.join(package_directory, "data",
1629                                         "development_fmri.csv"),
1630                            skip_header=True, dtype=dtype, names=names)
1631
1632    funcs = []
1633    regressors = []
1634
1635    for participant_id in participants['participant_id']:
1636        this_osf_id = osf_data[osf_data['participant_id'] == participant_id]
1637        # Download regressors
1638        confound_url = url.format(this_osf_id['key_r'][0])
1639        regressor_file = [(confounds.format(participant_id),
1640                           confound_url,
1641                           {'move': confounds.format(participant_id)})]
1642        path_to_regressor = _fetch_files(data_dir, regressor_file,
1643                                         verbose=verbose)[0]
1644        regressors.append(path_to_regressor)
1645        # Download bold images
1646        func_url = url.format(this_osf_id['key_b'][0])
        func_file = [(func.format(participant_id), func_url,
                      {'move': func.format(participant_id)})]
1649        path_to_func = _fetch_files(data_dir, func_file, resume=resume,
1650                                    verbose=verbose)[0]
1651        funcs.append(path_to_func)
1652    return funcs, regressors
1653
1654
1655@fill_doc
1656def fetch_development_fmri(n_subjects=None, reduce_confounds=True,
1657                           data_dir=None, resume=True, verbose=1,
1658                           age_group='both'):
    """Fetch movie watching based brain development dataset (fMRI).

    The data is downsampled to 4mm resolution for convenience, with a
    repetition time (TR) of 2 secs. The data comes from OpenNeuro; see
    Notes below.
1663
1664    Please cite :footcite:`richardson2018development`
1665    if you are using this dataset.
1666
1667    .. versionadded:: 0.5.2
1668
1669    Parameters
1670    ----------
    n_subjects : int, optional
        The number of subjects to load. If None, all 155 subjects are
        loaded.
1674
1675    reduce_confounds : bool, optional
1676        If True, the returned confounds only include 6 motion parameters,
1677        mean framewise displacement, signal from white matter, csf, and
1678        6 anatomical compcor parameters. This selection only serves the
1679        purpose of having realistic examples. Depending on your research
1680        question, other confounds might be more appropriate.
1681        If False, returns all fmriprep confounds.
1682        Default=True.
1683    %(data_dir)s
1684    %(resume)s
1685    %(verbose)s
1686    age_group : str, optional
1687        Default='both'. Which age group to fetch
1688
        - 'adult' = fetch adults only (n=33, ages 18-39)
1690        - 'child' = fetch children only (n=122, ages 3-12)
1691        - 'both' = fetch full sample (n=155)
1692
1693    Returns
1694    -------
1695    data : Bunch
1696        Dictionary-like object, the interest attributes are :
1697
1698        - 'func': list of str (Nifti files)
1699            Paths to downsampled functional MRI data (4D) for each subject.
1700
1701        - 'confounds': list of str (tsv files)
1702            Paths to confounds related to each subject.
1703
1704        - 'phenotypic': numpy.ndarray
1705            Contains each subject age, age group, child or adult, gender,
1706            handedness.
1707
1708    Notes
1709    -----
1710    The original data is downloaded from OpenNeuro
1711    https://openneuro.org/datasets/ds000228/versions/1.0.0
1712
1713    This fetcher downloads downsampled data that are available on Open
1714    Science Framework (OSF). Located here: https://osf.io/5hju4/files/
1715
1716    Preprocessing details: https://osf.io/wjtyq/
1717
    Note that if n_subjects > 2 and age_group is 'both', the fetcher
    returns a ratio of children and adults representative of the total
    sample.
1721
1722    References
1723    ----------
1724    .. footbibliography::
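
    Examples
    --------
    A minimal usage sketch; with ``n_subjects=1`` a single adult subject
    is downloaded on the first call and cached under ``data_dir``::

        from nilearn.datasets import fetch_development_fmri

        development = fetch_development_fmri(n_subjects=1)
        func_file = development.func[0]            # 4D Nifti image
        confounds_file = development.confounds[0]  # reduced confounds .tsv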
1725
1726    """
1727    dataset_name = 'development_fmri'
1728    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                verbose=verbose)
1730    keep_confounds = ['trans_x', 'trans_y', 'trans_z', 'rot_x', 'rot_y',
1731                      'rot_z', 'framewise_displacement', 'a_comp_cor_00',
1732                      'a_comp_cor_01', 'a_comp_cor_02', 'a_comp_cor_03',
1733                      'a_comp_cor_04', 'a_comp_cor_05', 'csf',
1734                      'white_matter']
1735
1736    # Dataset description
1737    fdescr = _get_dataset_descr(dataset_name)
1738
1739    # Participants data: ids, demographics, etc
1740    participants = _fetch_development_fmri_participants(data_dir=data_dir,
1741                                                        url=None,
1742                                                        verbose=verbose)
1743
1744    adult_count, child_count = _filter_func_regressors_by_participants(
1745            participants, age_group)  # noqa: E126
1746    max_subjects = adult_count + child_count
1747
1748    n_subjects = _set_invalid_n_subjects_to_max(n_subjects,
1749                                                max_subjects,
1750                                                age_group)
1751
1752    # To keep the proportion of children versus adults
1753    percent_total = float(n_subjects) / max_subjects
1754    n_child = np.round(percent_total * child_count).astype(int)
1755    n_adult = np.round(percent_total * adult_count).astype(int)
1756
1757    # We want to return adults by default (i.e., `age_group=both`) or
1758    # if explicitly requested.
1759    if (age_group != 'child') and (n_subjects == 1):
1760        n_adult, n_child = 1, 0
1761
1762    if (age_group == 'both') and (n_subjects == 2):
1763        n_adult, n_child = 1, 1
1764
1765    participants = _filter_csv_by_n_subjects(participants, n_adult, n_child)
1766
1767    funcs, regressors = _fetch_development_fmri_functional(participants,
1768                                                           data_dir=data_dir,
1769                                                           url=None,
1770                                                           resume=resume,
1771                                                           verbose=verbose)
1772
1773    if reduce_confounds:
1774        regressors = _reduce_confounds(regressors, keep_confounds)
1775    return Bunch(func=funcs, confounds=regressors, phenotypic=participants,
1776                 description=fdescr)
1777
1778
1779def _filter_func_regressors_by_participants(participants, age_group):
    """Filter functional data and regressors based on participants."""
1782    valid_age_groups = ('both', 'child', 'adult')
1783    if age_group not in valid_age_groups:
1784        raise ValueError("Wrong value for age_group={0}. "
1785                         "Valid arguments are: {1}".format(age_group,
1786                                                           valid_age_groups)
1787                         )
1788
1789    child_adult = participants['Child_Adult'].tolist()
1790
1791    if age_group != 'adult':
1792        child_count = child_adult.count('child')
1793    else:
1794        child_count = 0
1795
1796    if age_group != 'child':
1797        adult_count = child_adult.count('adult')
1798    else:
1799        adult_count = 0
1800    return adult_count, child_count
1801
1802
1803def _filter_csv_by_n_subjects(participants, n_adult, n_child):
    """Restrict the phenotypic data to the requested number of subjects."""
1806    child_ids = participants[participants['Child_Adult'] ==
1807                             'child']['participant_id'][:n_child]
1808    adult_ids = participants[participants['Child_Adult'] ==
1809                             'adult']['participant_id'][:n_adult]
1810    ids = np.hstack([adult_ids, child_ids])
1811    participants = participants[np.in1d(participants['participant_id'], ids)]
1812    participants = participants[np.argsort(participants, order='Child_Adult')]
1813    return participants
1814
1815
1816def _set_invalid_n_subjects_to_max(n_subjects, max_subjects, age_group):
    """If n_subjects is invalid, set it to the maximum."""
1819    if n_subjects is None:
1820        n_subjects = max_subjects
1821
1822    if (isinstance(n_subjects, numbers.Number) and
1823            ((n_subjects > max_subjects) or (n_subjects < 1))):
1824        warnings.warn("Wrong value for n_subjects={0}. The maximum "
1825                      "value (for age_group={1}) will be used instead: "
1826                      "n_subjects={2}"
1827                      .format(n_subjects, age_group, max_subjects))
1828        n_subjects = max_subjects
1829    return n_subjects
1830
1831
1832def _reduce_confounds(regressors, keep_confounds):
1833    reduced_regressors = []
1834    for in_file in regressors:
1835        out_file = in_file.replace('desc-confounds',
1836                                   'desc-reducedConfounds')
1837        if not os.path.isfile(out_file):
1838            confounds = np.recfromcsv(in_file, delimiter='\t')
1839            selected_confounds = confounds[keep_confounds]
1840            header = '\t'.join(selected_confounds.dtype.names)
1841            np.savetxt(out_file, np.array(selected_confounds.tolist()),
1842                       header=header, delimiter='\t', comments='')
1843        reduced_regressors.append(out_file)
1844    return reduced_regressors
1845
1846
1847# datasets originally belonging to nistats follow
1848
1849
1850@fill_doc
1851def fetch_language_localizer_demo_dataset(data_dir=None, verbose=1):
1852    """Download language localizer demo dataset.
1853
1854    Parameters
1855    ----------
1856    %(data_dir)s
1857    %(verbose)s
1858
1859    Returns
1860    -------
1861    data_dir : string
1862        Path to downloaded dataset.
1863
1864    downloaded_files : list of string
        Absolute paths of downloaded files on disk.
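
    Examples
    --------
    A minimal usage sketch; the archive is downloaded and uncompressed
    only if the dataset directory is still empty::

        from nilearn.datasets import fetch_language_localizer_demo_dataset

        data_dir, file_list = fetch_language_localizer_demo_dataset()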
1866
1867    """
1868    url = 'https://osf.io/3dj2a/download'
1869    # When it starts working again change back to:
1870    # url = 'https://osf.io/nh987/download'
1871    main_folder = 'fMRI-language-localizer-demo-dataset'
1872
1873    data_dir = _get_dataset_dir(main_folder, data_dir=data_dir,
1874                                verbose=verbose)
1875    # The files_spec needed for _fetch_files
1876    files_spec = [(main_folder + '.zip', url, {'move': main_folder + '.zip'})]
1877    # Only download if directory is empty
1878    # Directory will have been created by the call to _get_dataset_dir above
1879    if not os.listdir(data_dir):
1880        downloaded_files = _fetch_files(data_dir, files_spec, resume=True,
1881                                        verbose=verbose)
1882        _uncompress_file(downloaded_files[0])
1883
1884    file_list = [os.path.join(path, f) for
1885                 path, dirs, files in os.walk(data_dir) for f in files]
1886    return data_dir, sorted(file_list)
1887
1888
1889@fill_doc
1890def fetch_bids_langloc_dataset(data_dir=None, verbose=1):
1891    """Download language localizer example :term:`bids<BIDS>` dataset.
1892
1893    Parameters
1894    ----------
1895    %(data_dir)s
1896    %(verbose)s
1897
1898    Returns
1899    -------
1900    data_dir : string
1901        Path to downloaded dataset.
1902
1903    downloaded_files : list of string
1904        Absolute paths of downloaded files on disk.
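
    Examples
    --------
    A minimal usage sketch; the archive is downloaded and uncompressed
    on the first call only::

        from nilearn.datasets import fetch_bids_langloc_dataset

        data_dir, file_list = fetch_bids_langloc_dataset()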
1905
1906    """
1907    url = 'https://files.osf.io/v1/resources/9q7dv/providers/osfstorage/5888d9a76c613b01fc6acc4e'  # noqa: E501
1908    dataset_name = 'bids_langloc_example'
1909    main_folder = 'bids_langloc_dataset'
1910    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
1911                                verbose=verbose)
1912    # The files_spec needed for _fetch_files
1913    files_spec = [(main_folder + '.zip', url, {'move': main_folder + '.zip'})]
1914    if not os.path.exists(os.path.join(data_dir, main_folder)):
1915        downloaded_files = _fetch_files(data_dir, files_spec, resume=True,
1916                                        verbose=verbose)
1917        _uncompress_file(downloaded_files[0])
1918    main_path = os.path.join(data_dir, main_folder)
1919    file_list = [os.path.join(path, f) for
1920                 path, dirs, files in os.walk(main_path) for f in files]
1921    return os.path.join(data_dir, main_folder), sorted(file_list)
1922
1923
1924@fill_doc
1925def fetch_openneuro_dataset_index(data_dir=None,
1926                                  dataset_version='ds000030_R1.0.4',
1927                                  verbose=1):
1928    """Download a file with OpenNeuro :term:`BIDS` dataset index.
1929
    Downloading the index allows one to explore the dataset directories
    and to select specific files to download. The index is a sorted list
    of urls.
1932
1933    Parameters
1934    ----------
1935    %(data_dir)s
1936    dataset_version : string, optional
1937        Dataset version name. Assumes it is of the form [name]_[version].
1938        Default='ds000030_R1.0.4'.
1939    %(verbose)s
1940
1941    Returns
1942    -------
1943    urls_path : string
1944        Path to downloaded dataset index.
1945
1946    urls : list of string
1947        Sorted list of dataset directories.
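
    Examples
    --------
    A minimal usage sketch; only the (small) index file is downloaded
    here, not the dataset itself::

        from nilearn.datasets import fetch_openneuro_dataset_index

        urls_path, urls = fetch_openneuro_dataset_index()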
1948
1949    """
1950    data_prefix = '{}/{}/uncompressed'.format(dataset_version.split('_')[0],
1951                                              dataset_version,
1952                                              )
1953    data_dir = _get_dataset_dir(data_prefix, data_dir=data_dir,
1954                                verbose=verbose)
1955
1956    file_url = 'https://osf.io/86xj7/download'
1957    final_download_path = os.path.join(data_dir, 'urls.json')
1958    downloaded_file_path = _fetch_files(data_dir=data_dir,
1959                                        files=[(final_download_path,
1960                                                file_url,
1961                                                {'move': final_download_path}
1962                                                )],
1963                                        resume=True
1964                                        )
1965    urls_path = downloaded_file_path[0]
1966    with open(urls_path, 'r') as json_file:
1967        urls = json.load(json_file)
1968    return urls_path, urls
1969
1970
1971def select_from_index(urls, inclusion_filters=None, exclusion_filters=None,
1972                      n_subjects=None):
1973    """Select subset of urls with given filters.
1974
1975    Parameters
1976    ----------
1977    urls : list of str
1978        List of dataset urls obtained from index download.
1979
    inclusion_filters : list of str, optional
        List of unix shell-style wildcard strings
        that will be used to filter the url list.
        If a filter matches the url, it is retained for download.
        Multiple filters act like a logical "and", creating a more
        restrictive query. Inclusion and exclusion filters apply together.
        For example, the filter '*task-rest*' would keep only urls
        that contain the 'task-rest' string.

    exclusion_filters : list of str, optional
        List of unix shell-style wildcard strings
        that will be used to filter the url list.
        If a filter matches the url, it is discarded for download.
        Multiple filters act like a logical "and", creating a more
        restrictive query. Inclusion and exclusion filters apply together.
        For example, the filter '*task-rest*' would discard all urls
        that contain the 'task-rest' string.
1999
2000    n_subjects : int, optional
2001        Number of subjects to download from the dataset. All by default.
2002
2003    Returns
2004    -------
2005    urls : list of string
2006        Sorted list of filtered dataset directories.
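
    Examples
    --------
    A minimal usage sketch; the index is downloaded first and then
    filtered locally (no imaging data is fetched here)::

        from nilearn.datasets import (fetch_openneuro_dataset_index,
                                      select_from_index)

        _, urls = fetch_openneuro_dataset_index()
        # Keep only resting-state files of the first two subjects
        urls = select_from_index(urls,
                                 inclusion_filters=['*task-rest*'],
                                 n_subjects=2)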
2007
2008    """
2009    inclusion_filters = inclusion_filters if inclusion_filters else []
2010    exclusion_filters = exclusion_filters if exclusion_filters else []
2011    # We apply filters to the urls
2012    for exclusion in exclusion_filters:
2013        urls = [url for url in urls if not fnmatch.fnmatch(url, exclusion)]
2014    for inclusion in inclusion_filters:
2015        urls = [url for url in urls if fnmatch.fnmatch(url, inclusion)]
2016
2017    # subject selection filter
2018    # from the url list we infer all available subjects like 'sub-xxx/'
    subject_regex = 'sub-[a-zA-Z0-9]*[_./]'
2020
2021    def infer_subjects(urls):
2022        subjects = set()
2023        for url in urls:
2024            if 'sub-' in url:
2025                subjects.add(re.search(subject_regex, url).group(0)[:-1])
2026        return sorted(subjects)
2027
2028    # We get a list of subjects (for the moment the first n subjects)
2029    selected_subjects = set(infer_subjects(urls)[:n_subjects])
2030    # We exclude urls of subjects not selected
2031    urls = [
2032        url for url in urls
2033        if 'sub-' not in url or re.search(subject_regex, url).group(0)[:-1]
2034           in selected_subjects
2035    ]
2036    return urls
2037
2038
2039def patch_openneuro_dataset(file_list):
2040    """Add symlinks for files not named according to latest :term:`BIDS` conventions.
2041    """
2042    rep = {'_T1w_brainmask': '_desc-brain_mask',
2043           '_T1w_preproc': '_desc-preproc_T1w',
2044           '_T1w_space-MNI152NLin2009cAsym_brainmask':
2045               '_space-MNI152NLin2009cAsym_desc-brain_mask',
2046           '_T1w_space-MNI152NLin2009cAsym_class-':
2047               '_space-MNI152NLin2009cAsym_label-',
2048           '_T1w_space-MNI152NLin2009cAsym_preproc':
2049               '_space-MNI152NLin2009cAsym_desc-preproc_T1w',
2050           '_bold_confounds': '_desc-confounds_regressors',
2051           '_bold_space-MNI152NLin2009cAsym_brainmask':
2052               '_space-MNI152NLin2009cAsym_desc-brain_mask',
2053           '_bold_space-MNI152NLin2009cAsym_preproc':
2054               '_space-MNI152NLin2009cAsym_desc-preproc_bold'
2055           }
2056    # Create a symlink if a file with the modified filename does not exist
2057    for old in rep:
2058        for name in file_list:
2059            if old in name:
2060                if not os.path.exists(name.replace(old, rep[old])):
2061                    os.symlink(name, name.replace(old, rep[old]))
2062
2063
2064@fill_doc
2065def fetch_openneuro_dataset(
2066    urls=None, data_dir=None, dataset_version='ds000030_R1.0.4',
2067    verbose=1):
2068    """Download OpenNeuro :term:`BIDS` dataset.
2069
2070    Parameters
2071    ----------
    urls : list of string, optional
        OpenNeuro url list of dataset files to download. If not specified,
        all files of the specified dataset will be downloaded.
2075    %(data_dir)s
2076    dataset_version : string, optional
2077        Dataset version name. Assumes it is of the form [name]_[version].
2078        Default is `ds000030_R1.0.4`.
2079    %(verbose)s
2080
2081    Returns
2082    -------
2083    data_dir : string
2084        Path to downloaded dataset.
2085
2086    downloaded_files : list of string
2087        Absolute paths of downloaded files on disk.
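
    Examples
    --------
    A minimal usage sketch; the index is filtered to a single subject
    before downloading, which still fetches a sizeable amount of data::

        from nilearn.datasets import (fetch_openneuro_dataset,
                                      fetch_openneuro_dataset_index,
                                      select_from_index)

        _, urls = fetch_openneuro_dataset_index()
        urls = select_from_index(urls, n_subjects=1)
        data_dir, files = fetch_openneuro_dataset(urls=urls)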
2088
2089    """
2090    data_prefix = '{}/{}/uncompressed'.format(
2091        dataset_version.split('_')[0], dataset_version)
2092    data_dir = _get_dataset_dir(data_prefix, data_dir=data_dir,
2093                                verbose=verbose)
2094
2095    # if urls are not specified we download the complete dataset index
2096    if urls is None:
2097        _, urls = fetch_openneuro_dataset_index(
2098            data_dir=data_dir, dataset_version=dataset_version,
2099            verbose=verbose)
2100
2101    # The files_spec needed for _fetch_files
2102    files_spec = []
2103    files_dir = []
2104    for url in urls:
2105        url_path = url.split(data_prefix + '/')[1]
2106        file_dir = os.path.join(data_dir, url_path)
2107        files_spec.append((os.path.basename(file_dir), url, {}))
2108        files_dir.append(os.path.dirname(file_dir))
2109
2110    # download the files
2111    downloaded = []
2112    for file_spec, file_dir in zip(files_spec, files_dir):
        # Timeout errors are common with the S3 connection, so we retry a
        # few times to avoid failing the whole dataset download because of
        # a transient instability.
2115        success = False
2116        download_attempts = 4
2117        while download_attempts > 0 and not success:
2118            try:
2119                downloaded_files = _fetch_files(
2120                    file_dir, [file_spec], resume=True, verbose=verbose)
2121                downloaded += downloaded_files
2122                success = True
2123            except Exception:
2124                download_attempts -= 1
2125        if not success:
2126            raise Exception('multiple failures downloading %s' % file_spec[1])
2127    patch_openneuro_dataset(downloaded)
2128
2129    return data_dir, sorted(downloaded)
2130
2131
2132@fill_doc
2133def fetch_localizer_first_level(data_dir=None, verbose=1):
    """Download a first-level localizer fMRI dataset.
2135
2136    Parameters
2137    ----------
2138    %(data_dir)s
2139    %(verbose)s
2140
2141    Returns
2142    -------
2143    data : sklearn.datasets.base.Bunch
2144        Dictionary-like object, with the keys:
2145        epi_img: the input 4D image
        events: a tsv file describing the paradigm
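
    Examples
    --------
    A minimal usage sketch; both files are downloaded on the first call::

        from nilearn.datasets import fetch_localizer_first_level

        data = fetch_localizer_first_level()
        epi_img_path = data.epi_img  # 4D bold image
        events_path = data.events   # events file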
2147
2148    """
2149    url = 'https://osf.io/2bqxn/download'
2150    epi_img = 'sub-12069_task-localizer_space-MNI305.nii.gz'
2151    events = 'sub-12069_task-localizer_events.tsv'
2152    opts = {'uncompress': True}
2153    options = ('epi_img', 'events')
2154    dir_ = 'localizer_first_level'
2155    filenames = [(os.path.join(dir_, name), url, opts)
2156                 for name in [epi_img, events]]
2157
2158    dataset_name = 'localizer_first_level'
2159    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
2160                                verbose=verbose)
2161    files = _fetch_files(data_dir, filenames, verbose=verbose)
2162
2163    params = dict(list(zip(options, files)))
2164    return Bunch(**params)
2165
2166
2167def _download_spm_auditory_data(data_dir, subject_dir, subject_id):
2168    print('Data absent, downloading...')
2169    url = ('http://www.fil.ion.ucl.ac.uk/spm/download/data/MoAEpilot/'
2170           'MoAEpilot.zip')
2171    archive_path = os.path.join(subject_dir, os.path.basename(url))
2172    _fetch_file(url, subject_dir)
2173    try:
2174        _uncompress_file(archive_path)
2175    except:  # noqa: E722
2176        print('Archive corrupted, trying to download it again.')
2177        return fetch_spm_auditory(data_dir=data_dir, data_name='',
2178                                  subject_id=subject_id)
2179
2180
2181def _prepare_downloaded_spm_auditory_data(subject_dir):
    """Organizes the downloaded spm_auditory dataset files and converts
    spurious 4D volumes to 3D.
2184
2185    Parameters
2186    ----------
2187    subject_dir : string
2188        Path to subject's data directory.
2189
2190    Returns
2191    -------
2192    _subject_data : skl.Bunch object
2193        Scikit-Learn Bunch object containing data of a single subject
2194        from the SPM Auditory dataset.
2195
2196    """
2197    subject_data = {}
2198    spm_auditory_data_files = ["fM00223/fM00223_%03i.img" % index
2199                               for index in range(4, 100)]
2200    spm_auditory_data_files.append("sM00223/sM00223_002.img")
2201
2202    for file_name in spm_auditory_data_files:
2203        file_path = os.path.join(subject_dir, file_name)
2204        if os.path.exists(file_path):
2205            subject_data[file_name] = file_path
2206        else:
2207            print('%s missing from filelist!' % file_name)
2208            return None
2209
2210    _subject_data = {}
2211    _subject_data['func'] = sorted(
2212        [subject_data[x] for x in subject_data.keys()
2213         if re.match(r'^fM00223_0\d\d\.img$',
2214                     os.path.basename(x))])
2215
    # Volumes in this dataset have shape (64, 64, 64, 1); drop the
    # trailing singleton dimension.
2217    for x in _subject_data['func']:
2218        vol = nib.load(x)
2219        if len(vol.shape) == 4:
2220            vol = nib.Nifti1Image(get_data(vol)[:, :, :, 0],
2221                                  vol.affine)
2222            nib.save(vol, x)
2223
2224    _subject_data['anat'] = [subject_data[x] for x in subject_data.keys()
2225                             if re.match(r'^sM00223_002\.img$',
2226                                         os.path.basename(x))][0]
2227
2228    # ... same thing for anat
2229    vol = nib.load(_subject_data['anat'])
2230    if len(vol.shape) == 4:
2231        vol = nib.Nifti1Image(get_data(vol)[:, :, :, 0],
2232                              vol.affine)
2233        nib.save(vol, _subject_data['anat'])
2234
2235    return Bunch(**_subject_data)
2236
2237
2238def _make_path_events_file_spm_auditory_data(spm_auditory_data):
2239    """Accepts data for spm_auditory dataset as Bunch
2240    and constructs the filepath for its events descriptor file.
2241
2242    Parameters
2243    ----------
2244    spm_auditory_data : Bunch
2245
2246    Returns
2247    -------
2248    events_filepath : string
2249        Full path to the events.tsv file for spm_auditory dataset.
2250
2251    """
2252    events_file_location = os.path.dirname(spm_auditory_data['func'][0])
2253    events_filename = os.path.basename(events_file_location) + '_events.tsv'
2254    events_filepath = os.path.join(events_file_location, events_filename)
2255    return events_filepath
2256
2257
2258def _make_events_file_spm_auditory_data(events_filepath):
2259    """Accepts destination filepath including filename and
2260    creates the events.tsv file for the spm_auditory dataset.
2261
2262    Parameters
2263    ----------
2264    events_filepath : string
2265        The path where the events file will be created.
2266
2267    Returns
2268    -------
2269    None
2270
2271    """
2272    tr = 7.
2273    epoch_duration = 6 * tr  # duration in seconds
2274    conditions = ['rest', 'active'] * 8
2275    n_blocks = len(conditions)
2276    duration = epoch_duration * np.ones(n_blocks)
2277    onset = np.linspace(0, (n_blocks - 1) * epoch_duration, n_blocks)
2278    events = pd.DataFrame(
2279        {'onset': onset, 'duration': duration, 'trial_type': conditions})
2280    events.to_csv(events_filepath, sep='\t', index=False,
2281                  columns=['onset', 'duration', 'trial_type'])
2282
2283
2284@fill_doc
2285def fetch_spm_auditory(data_dir=None, data_name='spm_auditory',
2286                       subject_id='sub001', verbose=1):
2287    """Function to fetch SPM auditory single-subject data.
2288
2289    See :footcite:`spm_auditory`.
2290
2291    Parameters
2292    ----------
2293    %(data_dir)s
2294    data_name : string, optional
2295        Name of the dataset. Default='spm_auditory'.
2296
2297    subject_id : string, optional
2298        Indicates which subject to retrieve.
2299        Default='sub001'.
2300    %(verbose)s
2301
2302    Returns
2303    -------
    data : sklearn.datasets.base.Bunch
        Dictionary-like object, the interest attributes are:
        - 'func': string list. Paths to functional images
        - 'anat': string. Path to anat image
        - 'events': string. Path to the events.tsv file
2308
2309    References
2310    ----------
2311    .. footbibliography::
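
    Examples
    --------
    A minimal usage sketch; the archive is downloaded and unpacked on the
    first call::

        from nilearn.datasets import fetch_spm_auditory

        subject_data = fetch_spm_auditory()
        first_scan = subject_data.func[0]
        anat_file = subject_data.anat
        events_file = subject_data.events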
2312
2313    """
2314    data_dir = _get_dataset_dir(data_name, data_dir=data_dir,
2315                                verbose=verbose)
2316    subject_dir = os.path.join(data_dir, subject_id)
2317    if not os.path.exists(subject_dir):
2318        _download_spm_auditory_data(data_dir, subject_dir, subject_id)
2319    spm_auditory_data = _prepare_downloaded_spm_auditory_data(subject_dir)
2320    try:
2321        spm_auditory_data['events']
2322    except KeyError:
2323        events_filepath = _make_path_events_file_spm_auditory_data(
2324            spm_auditory_data)
2325        if not os.path.isfile(events_filepath):
2326            _make_events_file_spm_auditory_data(events_filepath)
2327        spm_auditory_data['events'] = events_filepath
2328    return spm_auditory_data
2329
2330
2331def _get_func_data_spm_multimodal(subject_dir, session, _subject_data):
2332    session_func = sorted(glob.glob(
2333        os.path.join(
2334            subject_dir,
2335            ('fMRI/Session%i/fMETHODS-000%i-*-01.img' % (
2336                session, session + 4)
2337             )
2338        )
2339    ))
2340    if len(session_func) < 390:
2341        print('Missing %i functional scans for session %i.' % (
2342            390 - len(session_func), session))
2343        return None
2344
2345    _subject_data['func%i' % (session)] = session_func
2346    return _subject_data
2347
2348
2349def _get_session_trials_spm_multimodal(subject_dir, session, _subject_data):
2350    sess_trials = os.path.join(
2351        subject_dir,
2352        'fMRI/trials_ses%i.mat' % (session))
2353    if not os.path.isfile(sess_trials):
2354        print('Missing session file: %s' % sess_trials)
2355        return None
2356
2357    _subject_data['trials_ses%i' % (session)] = sess_trials
2358    return _subject_data
2359
2360
2361def _get_anatomical_data_spm_multimodal(subject_dir, _subject_data):
2362    anat = os.path.join(subject_dir, 'sMRI/smri.img')
2363    if not os.path.isfile(anat):
2364        print('Missing structural image.')
2365        return None
2366
2367    _subject_data['anat'] = anat
2368    return _subject_data
2369
2370
2371def _glob_spm_multimodal_fmri_data(subject_dir):
2372    """glob data from subject_dir."""
2373    _subject_data = {'slice_order': 'descending'}
2374
2375    for session in range(1, 3):
2376        # glob func data for session
2377        _subject_data = _get_func_data_spm_multimodal(subject_dir,
2378                                                      session,
2379                                                      _subject_data)
2380        if not _subject_data:
2381            return None
2382        # glob trials .mat file
2383        _subject_data = _get_session_trials_spm_multimodal(subject_dir,
2384                                                           session,
2385                                                           _subject_data)
2386        if not _subject_data:
2387            return None
2388        try:
2389            events = _make_events_file_spm_multimodal_fmri(_subject_data,
2390                                                           session)
2391        except MatReadError as mat_err:
2392            warnings.warn(
2393                '{}. An events.tsv file '
2394                'cannot be generated'.format(str(mat_err)))
2395        else:
2396            events_filepath = _make_events_filepath_spm_multimodal_fmri(
2397                _subject_data, session)
2398            events.to_csv(events_filepath, sep='\t', index=False)
2399            _subject_data['events{}'.format(session)] = events_filepath
2400
2401    # glob for anat data
2402    _subject_data = _get_anatomical_data_spm_multimodal(subject_dir,
2403                                                        _subject_data)
2404    if not _subject_data:
2405        return None
2406
2407    return Bunch(**_subject_data)
2408
2409
2410def _download_data_spm_multimodal(data_dir, subject_dir, subject_id):
2411    print('Data absent, downloading...')
2412    urls = [
2413        # fmri
2414        ('http://www.fil.ion.ucl.ac.uk/spm/download/data/mmfaces/'
2415         'multimodal_fmri.zip'),
2416        # structural
2417        ('http://www.fil.ion.ucl.ac.uk/spm/download/data/mmfaces/'
2418         'multimodal_smri.zip')
2419    ]
2420
2421    for url in urls:
2422        archive_path = os.path.join(subject_dir, os.path.basename(url))
2423        _fetch_file(url, subject_dir)
2424        try:
2425            _uncompress_file(archive_path)
2426        except:  # noqa: E722
2427            print('Archive corrupted, trying to download it again.')
2428            return fetch_spm_multimodal_fmri(data_dir=data_dir,
2429                                             data_name='',
2430                                             subject_id=subject_id)
2431
2432    return _glob_spm_multimodal_fmri_data(subject_dir)
2433
2434
2435def _make_events_filepath_spm_multimodal_fmri(_subject_data, session):
2436    key = 'trials_ses{}'.format(session)
2437    events_file_location = os.path.dirname(_subject_data[key])
2438    events_filename = 'session{}_events.tsv'.format(session)
2439    events_filepath = os.path.join(events_file_location, events_filename)
2440    return events_filepath
2441
2442
2443def _make_events_file_spm_multimodal_fmri(_subject_data, session):
2444    tr = 2.
2445    timing = loadmat(_subject_data['trials_ses%i' % (session)],
2446                     squeeze_me=True, struct_as_record=False)
2447    faces_onsets = timing['onsets'][0].ravel()
2448    scrambled_onsets = timing['onsets'][1].ravel()
2449    onsets = np.hstack((faces_onsets, scrambled_onsets))
    onsets *= tr  # because onsets were reported in 'scans' units
2451    conditions = (
2452        ['faces'] * len(faces_onsets) + ['scrambled'] * len(scrambled_onsets)
2453    )
2454    duration = np.ones_like(onsets)
2455    events = pd.DataFrame({'trial_type': conditions, 'onset': onsets,
2456                           'duration': duration})
2457    return events
2458
2459
2460@fill_doc
2461def fetch_spm_multimodal_fmri(data_dir=None, data_name='spm_multimodal_fmri',
2462                              subject_id='sub001', verbose=1):
2463    """Fetcher for Multi-modal Face Dataset.
2464
2465    See :footcite:`spm_multiface`.
2466
2467    Parameters
2468    ----------
2469    %(data_dir)s
2470    data_name : string, optional
2471        Name of the dataset. Default='spm_multimodal_fmri'.
2472
2473    subject_id : string, optional
2474        Indicates which subject to retrieve. Default='sub001'.
2475    %(verbose)s
2476
2477    Returns
2478    -------
2479    data : sklearn.datasets.base.Bunch
2480        Dictionary-like object, the interest attributes are:
2481        - 'func1': string list. Paths to functional images for session 1
2482        - 'func2': string list. Paths to functional images for session 2
        - 'trials_ses1': string. Path to the onsets .mat file for session 1
        - 'trials_ses2': string. Path to the onsets .mat file for session 2
2485        - 'anat': string. Path to anat file
2486
2487    References
2488    ----------
2489    .. footbibliography::
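
    Examples
    --------
    A minimal usage sketch; two archives (functional and structural) are
    downloaded and unpacked on the first call::

        from nilearn.datasets import fetch_spm_multimodal_fmri

        subject_data = fetch_spm_multimodal_fmri()
        session1_scans = subject_data.func1      # list of .img paths
        session1_onsets = subject_data.trials_ses1
        anat_file = subject_data.anat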
2490
2491    """
2492    data_dir = _get_dataset_dir(data_name, data_dir=data_dir, verbose=verbose)
2493    subject_dir = os.path.join(data_dir, subject_id)
2494
2495    # maybe data_dir already contains the data ?
2496    data = _glob_spm_multimodal_fmri_data(subject_dir)
2497    if data is not None:
2498        return data
2499
2500    # No. Download the data
2501    return _download_data_spm_multimodal(data_dir, subject_dir, subject_id)
2502
2503
2504@fill_doc
2505def fetch_fiac_first_level(data_dir=None, verbose=1):
    """Download a first-level FIAC fMRI dataset (2 sessions).
2507
2508    Parameters
2509    ----------
2510    %(data_dir)s
2511    %(verbose)s
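
    Returns
    -------
    data : sklearn.datasets.base.Bunch
        Dictionary-like object, the interest attributes are:
        - 'func1', 'func2': strings. Paths to the functional images for
          sessions 1 and 2
        - 'design_matrix1', 'design_matrix2': strings. Paths to the
          design-matrix .npz files for sessions 1 and 2
        - 'mask': string. Path to the mask image

    Examples
    --------
    A minimal usage sketch; the archive is downloaded and unpacked on the
    first call::

        from nilearn.datasets import fetch_fiac_first_level

        data = fetch_fiac_first_level()
        session1_func = data.func1
        session1_design = data.design_matrix1
        mask_file = data.mask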
2512
2513    """
2514    data_dir = _get_dataset_dir('fiac_nilearn.glm', data_dir=data_dir,
2515                                verbose=verbose)
2516
2517    def _glob_fiac_data():
2518        """glob data from subject_dir."""
2519        _subject_data = {}
2520        subject_dir = os.path.join(data_dir, 'nipy-data-0.2/data/fiac/fiac0')
2521        for session in [1, 2]:
2522            # glob func data for session
2523            session_func = os.path.join(subject_dir, 'run%i.nii.gz' % session)
2524            if not os.path.isfile(session_func):
2525                print('Missing functional scan for session %i.' % session)
2526                return None
2527
2528            _subject_data['func%i' % session] = session_func
2529
2530            # glob design matrix .npz file
2531            sess_dmtx = os.path.join(subject_dir,
2532                                     'run%i_design.npz' % session)
2533            if not os.path.isfile(sess_dmtx):
2534                print('Missing session file: %s' % sess_dmtx)
2535                return None
2536
2537            _subject_data['design_matrix%i' % session] = sess_dmtx
2538
2539        # glob for mask data
2540        mask = os.path.join(subject_dir, 'mask.nii.gz')
2541        if not os.path.isfile(mask):
2542            print('Missing mask image.')
2543            return None
2544
2545        _subject_data['mask'] = mask
2546        return Bunch(**_subject_data)
2547
2548    # maybe data_dir already contains the data ?
2549    data = _glob_fiac_data()
2550    if data is not None:
2551        return data
2552
2553    # No. Download the data
2554    print('Data absent, downloading...')
2555    url = 'http://nipy.sourceforge.net/data-packages/nipy-data-0.2.tar.gz'
2556
2557    archive_path = os.path.join(data_dir, os.path.basename(url))
2558    _fetch_file(url, data_dir)
2559    try:
2560        _uncompress_file(archive_path)
2561    except:  # noqa: E722
2562        print('Archive corrupted, trying to download it again.')
2563        return fetch_fiac_first_level(data_dir=data_dir)
2564
2565    return _glob_fiac_data()
2566