1"""
2Base IO code for all datasets
3"""
4
5# Copyright (c) 2007 David Cournapeau <cournape@gmail.com>
6#               2010 Fabian Pedregosa <fabian.pedregosa@inria.fr>
7#               2010 Olivier Grisel <olivier.grisel@ensta.org>
8# License: BSD 3 clause
9import csv
10import hashlib
11import gzip
12import shutil
13from collections import namedtuple
14from os import environ, listdir, makedirs
15from os.path import expanduser, isdir, join, splitext
16from importlib import resources
17
18from ..utils import Bunch
19from ..utils import check_random_state
20from ..utils import check_pandas_support
21from ..utils.deprecation import deprecated
22
23import numpy as np
24
25from urllib.request import urlretrieve
26
27DATA_MODULE = "sklearn.datasets.data"
28DESCR_MODULE = "sklearn.datasets.descr"
29IMAGES_MODULE = "sklearn.datasets.images"
30
31RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"])
32
33
def get_data_home(data_home=None) -> str:
    """Return the path of the scikit-learn data directory.

    This folder is used by some large dataset loaders to avoid downloading the
    data several times.

    By default the data directory is set to a folder named 'scikit_learn_data'
    in the user home folder.

    Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment
    variable or programmatically by giving an explicit folder path. The '~'
    symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home : str, default=None
        The path to scikit-learn data directory. If `None`, the default path
        is `~/scikit_learn_data`.

    Returns
    -------
    data_home : str
        The path to the scikit-learn data directory.
    """
    if data_home is None:
        # Environment variable takes precedence over the built-in default.
        data_home = environ.get("SCIKIT_LEARN_DATA", join("~", "scikit_learn_data"))
    data_home = expanduser(data_home)
    makedirs(data_home, exist_ok=True)
    return data_home
60
61
def clear_data_home(data_home=None):
    """Delete all the content of the data home cache.

    Parameters
    ----------
    data_home : str, default=None
        The path to scikit-learn data directory. If `None`, the default path
        is `~/scikit_learn_data`.
    """
    # get_data_home resolves the default path / env var and expands '~'.
    data_home = get_data_home(data_home)
    shutil.rmtree(data_home)
73
74
def _convert_data_dataframe(
    caller_name, data, target, feature_names, target_names, sparse_data=False
):
    """Build (frame, X, y) pandas containers from array data and target.

    Returns the combined frame, the feature columns `X`, and the target `y`
    (a Series when there is a single target column, a DataFrame otherwise).
    """
    # pandas is an optional dependency; this raises an informative error
    # mentioning the caller when it is not installed.
    pd = check_pandas_support("{} with as_frame=True".format(caller_name))

    if sparse_data:
        data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names)
    else:
        data_df = pd.DataFrame(data, columns=feature_names)

    target_df = pd.DataFrame(target, columns=target_names)
    combined_df = pd.concat([data_df, target_df], axis=1)

    X = combined_df[feature_names]
    y = combined_df[target_names]
    # A single target column is exposed as a Series rather than a DataFrame.
    if y.shape[1] == 1:
        y = y.iloc[:, 0]
    return combined_df, X, y
91
92
def load_files(
    container_path,
    *,
    description=None,
    categories=None,
    load_content=True,
    shuffle=True,
    encoding=None,
    decode_error="strict",
    random_state=0,
):
    """Load text files with categories as subfolder names.

    Individual samples are assumed to be files stored in a two-level folder
    structure such as the following::

        container_folder/
            category_1_folder/
                file_1.txt
                file_2.txt
                ...
            category_2_folder/
                file_43.txt
                ...

    The folder names are used as supervised signal label names; the
    individual file names are not important.

    This function does not try to extract features into a numpy array or
    scipy sparse matrix, and if `load_content` is False it does not even
    load the files in memory. To use the text files in a scikit-learn
    classification or clustering algorithm, build a feature extraction
    transformer with the :mod:`~sklearn.feature_extraction.text` module.

    If you set `load_content=True`, you should also specify the `encoding`
    of the text (for many modern text files 'utf-8' is correct). If
    `encoding` is left as None, the content is returned as bytes, which
    most functions in :mod:`~sklearn.feature_extraction.text` cannot use.

    Similar feature extractors should be built for other kinds of
    unstructured data input such as images, audio, video, ...

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    container_path : str
        Path to the main folder holding one subfolder per category.

    description : str, default=None
        A paragraph describing the characteristic of the dataset: its
        source, reference, etc.

    categories : list of str, default=None
        If None (default), load all the categories. If not None, list of
        category names to load (other categories ignored).

    load_content : bool, default=True
        Whether to load the content of the different files. If True, a
        'data' attribute containing the text information is present in the
        returned data structure. If False, a `filenames` attribute gives
        the paths to the files instead.

    shuffle : bool, default=True
        Whether or not to shuffle the data: might be important for models
        that assume the samples are independent and identically distributed
        (i.i.d.), such as stochastic gradient descent.

    encoding : str, default=None
        If None, do not try to decode the content of the files (e.g. for
        images or other non-text content). If not None, encoding used to
        decode text files to Unicode when `load_content` is True.

    decode_error : {'strict', 'ignore', 'replace'}, default='strict'
        Instruction on what to do if a byte sequence is given to analyze
        that contains characters not of the given `encoding`. Passed as
        keyword argument 'errors' to bytes.decode.

    random_state : int, RandomState instance or None, default=0
        Determines random number generation for dataset shuffling. Pass an
        int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : list of str
            Only present when `load_content=True`.
            The raw text data to learn.
        target : ndarray
            The target labels (integer index).
        target_names : list
            The names of target classes.
        DESCR : str
            The full description of the dataset.
        filenames: ndarray
            The filenames holding the dataset.
    """
    # Category subfolders, in deterministic (sorted) order.
    folders = [
        entry
        for entry in sorted(listdir(container_path))
        if isdir(join(container_path, entry))
    ]
    if categories is not None:
        folders = [entry for entry in folders if entry in categories]

    target_names = []
    labels = []
    paths = []
    for label, folder in enumerate(folders):
        target_names.append(folder)
        folder_path = join(container_path, folder)
        for document in sorted(listdir(folder_path)):
            paths.append(join(folder_path, document))
            labels.append(label)

    # Arrays enable fancy indexing for the shuffle below.
    filenames = np.array(paths)
    target = np.array(labels)

    if shuffle:
        rng = check_random_state(random_state)
        permutation = np.arange(filenames.shape[0])
        rng.shuffle(permutation)
        filenames = filenames[permutation]
        target = target[permutation]

    if not load_content:
        return Bunch(
            filenames=filenames,
            target_names=target_names,
            target=target,
            DESCR=description,
        )

    data = []
    for filename in filenames:
        with open(filename, "rb") as f:
            data.append(f.read())
    if encoding is not None:
        data = [raw.decode(encoding, decode_error) for raw in data]
    return Bunch(
        data=data,
        filenames=filenames,
        target_names=target_names,
        target=target,
        DESCR=description,
    )
245
246
def load_csv_data(
    data_file_name,
    *,
    data_module=DATA_MODULE,
    descr_file_name=None,
    descr_module=DESCR_MODULE,
):
    """Loads `data_file_name` from `data_module` with `importlib.resources`.

    Parameters
    ----------
    data_file_name : str
        Name of csv file to be loaded from `data_module/data_file_name`.
        For example `'wine_data.csv'`.

    data_module : str or module, default='sklearn.datasets.data'
        Module where data lives. The default is `'sklearn.datasets.data'`.

    descr_file_name : str, default=None
        Name of rst file to be loaded from `descr_module/descr_file_name`.
        For example `'wine_data.rst'`. See also :func:`load_descr`.
        If not None, also returns the corresponding description of
        the dataset.

    descr_module : str or module, default='sklearn.datasets.descr'
        Module where `descr_file_name` lives. See also :func:`load_descr`.
        The default is `'sklearn.datasets.descr'`.

    Returns
    -------
    data : ndarray of shape (n_samples, n_features)
        A 2D array with each row representing one sample and each column
        representing the features of a given sample.

    target : ndarray of shape (n_samples,)
        A 1D array holding target variables for all the samples in `data`.
        For example target[0] is the target variable for data[0].

    target_names : ndarray of shape (n_classes,)
        A 1D array containing the names of the classifications. For example
        target_names[0] is the name of the target[0] class.

    descr : str, optional
        Description of the dataset (the content of `descr_file_name`).
        Only returned if `descr_file_name` is not None.
    """
    with resources.open_text(data_module, data_file_name) as csv_file:
        data_file = csv.reader(csv_file)
        # Header row: n_samples, n_features, then the class names.
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        target_names = np.array(temp[2:])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,), dtype=int)

        # Each subsequent row: feature values followed by the class label.
        for i, ir in enumerate(data_file):
            data[i] = np.asarray(ir[:-1], dtype=np.float64)
            target[i] = np.asarray(ir[-1], dtype=int)

    if descr_file_name is None:
        return data, target, target_names
    else:
        assert descr_module is not None
        descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)
        return data, target, target_names, descr
312
313
def load_gzip_compressed_csv_data(
    data_file_name,
    *,
    data_module=DATA_MODULE,
    descr_file_name=None,
    descr_module=DESCR_MODULE,
    encoding="utf-8",
    **kwargs,
):
    """Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`.

    1) Open resource file with `importlib.resources.open_binary`
    2) Decompress file obj with `gzip.open`
    3) Load decompressed data with `np.loadtxt`

    Parameters
    ----------
    data_file_name : str
        Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from
        `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`.

    data_module : str or module, default='sklearn.datasets.data'
        Module where data lives. The default is `'sklearn.datasets.data'`.

    descr_file_name : str, default=None
        Name of rst file to be loaded from `descr_module/descr_file_name`.
        For example `'wine_data.rst'`. See also :func:`load_descr`.
        If not None, also returns the corresponding description of
        the dataset.

    descr_module : str or module, default='sklearn.datasets.descr'
        Module where `descr_file_name` lives. See also :func:`load_descr`.
        The default is `'sklearn.datasets.descr'`.

    encoding : str, default="utf-8"
        Name of the encoding that the gzip-decompressed file will be
        decoded with. The default is 'utf-8'.

    **kwargs : dict, optional
        Keyword arguments to be passed to `np.loadtxt`;
        e.g. delimiter=','.

    Returns
    -------
    data : ndarray of shape (n_samples, n_features)
        A 2D array with each row representing one sample and each column
        representing the features and/or target of a given sample.

    descr : str, optional
        Description of the dataset (the content of `descr_file_name`).
        Only returned if `descr_file_name` is not None.
    """
    with resources.open_binary(data_module, data_file_name) as compressed_file:
        # Use a nested `with` so the gzip wrapper is closed deterministically
        # (previously only the underlying resource file was closed).
        with gzip.open(compressed_file, mode="rt", encoding=encoding) as data_file:
            data = np.loadtxt(data_file, **kwargs)

    if descr_file_name is None:
        return data
    else:
        assert descr_module is not None
        descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name)
        return data, descr
376
377
def load_descr(descr_file_name, *, descr_module=DESCR_MODULE):
    """Load `descr_file_name` from `descr_module` with `importlib.resources`.

    Parameters
    ----------
    descr_file_name : str, default=None
        Name of rst file to be loaded from `descr_module/descr_file_name`.
        For example `'wine_data.rst'`. See also :func:`load_descr`.
        If not None, also returns the corresponding description of
        the dataset.

    descr_module : str or module, default='sklearn.datasets.descr'
        Module where `descr_file_name` lives. See also :func:`load_descr`.
        The default is `'sklearn.datasets.descr'`.

    Returns
    -------
    fdescr : str
        Content of `descr_file_name`.
    """
    return resources.read_text(descr_module, descr_file_name)
401
402
def load_wine(*, return_X_y=False, as_frame=False):
    """Load and return the wine dataset (classification).

    .. versionadded:: 0.18

    The wine dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes                          3
    Samples per class        [59,71,48]
    Samples total                  178
    Dimensionality                  13
    Features            real, positive
    =================   ==============

    The copy of UCI ML Wine Data Set dataset is downloaded and modified to
    fit standard format from:
    https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data

    Read more in the :ref:`User Guide <wine_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is a pandas DataFrame or
        Series depending on the number of target columns. If `return_X_y`
        is True, then (`data`, `target`) will be pandas DataFrames or
        Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (178, 13)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (178,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.
        frame: DataFrame of shape (178, 14)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

    Examples
    --------
    Let's say you are interested in the samples 10, 80, and 140, and want to
    know their class name.

    >>> from sklearn.datasets import load_wine
    >>> data = load_wine()
    >>> data.target[[10, 80, 140]]
    array([0, 1, 2])
    >>> list(data.target_names)
    ['class_0', 'class_1', 'class_2']
    """
    data, target, target_names, descr = load_csv_data(
        data_file_name="wine_data.csv", descr_file_name="wine_data.rst"
    )

    # Column names for the 13 chemical-analysis features.
    feature_names = [
        "alcohol",
        "malic_acid",
        "ash",
        "alcalinity_of_ash",
        "magnesium",
        "total_phenols",
        "flavanoids",
        "nonflavanoid_phenols",
        "proanthocyanins",
        "color_intensity",
        "hue",
        "od280/od315_of_diluted_wines",
        "proline",
    ]

    target_columns = ["target"]
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_wine", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        DESCR=descr,
        feature_names=feature_names,
    )
518
519
def load_iris(*, return_X_y=False, as_frame=False):
    """Load and return the iris dataset (classification).

    The iris dataset is a classic and very easy multi-class classification
    dataset.

    =================   ==============
    Classes                          3
    Samples per class               50
    Samples total                  150
    Dimensionality                   4
    Features            real, positive
    =================   ==============

    Read more in the :ref:`User Guide <iris_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object. See
        below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is a pandas DataFrame or
        Series depending on the number of target columns. If `return_X_y`
        is True, then (`data`, `target`) will be pandas DataFrames or
        Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (150, 4)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (150,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.
        frame: DataFrame of shape (150, 5)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        filename: str
            The path to the location of the data.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True
        A tuple of two ndarray. The first containing a 2D array of shape
        (n_samples, n_features) with each row representing one sample and
        each column representing the features. The second ndarray of shape
        (n_samples,) containing the target samples.

        .. versionadded:: 0.18

    Notes
    -----
        .. versionchanged:: 0.20
            Fixed two wrong data points according to Fisher's paper.
            The new version is the same as in R, but not as in the UCI
            Machine Learning Repository.

    Examples
    --------
    Let's say you are interested in the samples 10, 25, and 50, and want to
    know their class name.

    >>> from sklearn.datasets import load_iris
    >>> data = load_iris()
    >>> data.target[[10, 25, 50]]
    array([0, 0, 1])
    >>> list(data.target_names)
    ['setosa', 'versicolor', 'virginica']
    """
    data_file_name = "iris.csv"
    data, target, target_names, descr = load_csv_data(
        data_file_name=data_file_name, descr_file_name="iris.rst"
    )

    # Measurement columns, all in centimeters.
    feature_names = [
        "sepal length (cm)",
        "sepal width (cm)",
        "petal length (cm)",
        "petal width (cm)",
    ]

    target_columns = ["target"]
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_iris", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        DESCR=descr,
        feature_names=feature_names,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )
641
642
def load_breast_cancer(*, return_X_y=False, as_frame=False):
    """Load and return the breast cancer wisconsin dataset (classification).

    The breast cancer dataset is a classic and very easy binary
    classification dataset.

    =================   ==============
    Classes                          2
    Samples per class    212(M),357(B)
    Samples total                  569
    Dimensionality                  30
    Features            real, positive
    =================   ==============

    The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is
    downloaded from:
    https://goo.gl/U2Uwz2

    Read more in the :ref:`User Guide <breast_cancer_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is a pandas DataFrame or
        Series depending on the number of target columns. If `return_X_y`
        is True, then (`data`, `target`) will be pandas DataFrames or
        Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (569, 30)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (569,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.
        frame: DataFrame of shape (569, 31)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        filename: str
            The path to the location of the data.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    Examples
    --------
    Let's say you are interested in the samples 10, 50, and 85, and want to
    know their class name.

    >>> from sklearn.datasets import load_breast_cancer
    >>> data = load_breast_cancer()
    >>> data.target[[10, 50, 85]]
    array([0, 1, 0])
    >>> list(data.target_names)
    ['malignant', 'benign']
    """
    data_file_name = "breast_cancer.csv"
    data, target, target_names, descr = load_csv_data(
        data_file_name=data_file_name, descr_file_name="breast_cancer.rst"
    )

    # 10 base measurements, each reported as mean, standard error ("error")
    # and worst (largest) value, for 30 features total.
    base_measurements = [
        "radius",
        "texture",
        "perimeter",
        "area",
        "smoothness",
        "compactness",
        "concavity",
        "concave points",
        "symmetry",
        "fractal dimension",
    ]
    feature_names = np.array(
        [
            "{} {}".format(prefix, name)
            for prefix in ("mean", "", "worst")
            for name in base_measurements
        ]
    )
    # The middle group uses the "<name> error" convention instead of a
    # prefix; rebuild those entries explicitly.
    feature_names[10:20] = ["{} error".format(name) for name in base_measurements]

    target_columns = ["target"]
    frame = None
    if as_frame:
        frame, data, target = _convert_data_dataframe(
            "load_breast_cancer", data, target, feature_names, target_columns
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        target_names=target_names,
        DESCR=descr,
        feature_names=feature_names,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )
785
786
def load_digits(*, n_class=10, return_X_y=False, as_frame=False):
    """Load and return the digits dataset (classification).

    Each datapoint is a 8x8 image of a digit.

    =================   ==============
    Classes                         10
    Samples per class             ~180
    Samples total                 1797
    Dimensionality                  64
    Features             integers 0-16
    =================   ==============

    This is a copy of the test set of the UCI ML hand-written digits datasets
    https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

    Read more in the :ref:`User Guide <digits_dataset>`.

    Parameters
    ----------
    n_class : int, default=10
        The number of classes to return. Between 0 and 10.

    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is a pandas DataFrame or
        Series depending on the number of target columns. If `return_X_y`
        is True, then (`data`, `target`) will be pandas DataFrames or
        Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (1797, 64)
            The flattened data matrix. If `as_frame=True`, `data` will be
            a pandas DataFrame.
        target: {ndarray, Series} of shape (1797,)
            The classification target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of target classes.

            .. versionadded:: 0.20

        frame: DataFrame of shape (1797, 65)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        images: {ndarray} of shape (1797, 8, 8)
            The raw image data.
        DESCR: str
            The full description of the dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    Examples
    --------
    To load the data and visualize the images::

        >>> from sklearn.datasets import load_digits
        >>> digits = load_digits()
        >>> print(digits.data.shape)
        (1797, 64)
        >>> import matplotlib.pyplot as plt
        >>> plt.gray()
        >>> plt.matshow(digits.images[0])
        <...>
        >>> plt.show()
    """
    data, descr = load_gzip_compressed_csv_data(
        data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter=","
    )

    # The last csv column is the class label; the remaining 64 are pixels.
    target = data[:, -1].astype(int, copy=False)
    flat_data = data[:, :-1]

    # Expose the pixels as 8x8 images without copying the underlying buffer.
    images = flat_data.view()
    images.shape = (-1, 8, 8)

    if n_class < 10:
        mask = target < n_class
        flat_data = flat_data[mask]
        target = target[mask]
        images = images[mask]

    feature_names = [
        "pixel_{}_{}".format(row, col) for row in range(8) for col in range(8)
    ]

    target_columns = ["target"]
    frame = None
    if as_frame:
        frame, flat_data, target = _convert_data_dataframe(
            "load_digits", flat_data, target, feature_names, target_columns
        )

    if return_X_y:
        return flat_data, target

    return Bunch(
        data=flat_data,
        target=target,
        frame=frame,
        feature_names=feature_names,
        target_names=np.arange(10),
        images=images,
        DESCR=descr,
    )
913
914
def load_diabetes(*, return_X_y=False, as_frame=False):
    """Load and return the diabetes dataset (regression).

    ==============   ==================
    Samples total    442
    Dimensionality   10
    Features         real, -.2 < x < .2
    Targets          integer 25 - 346
    ==============   ==================

    .. note::
       The meaning of each feature (i.e. `feature_names`) might be unclear
       (especially for `ltg`) as the documentation of the original dataset is
       not explicit. We provide information that seems correct in regard with
       the scientific literature in this field of research.

    Read more in the :ref:`User Guide <diabetes_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (442, 10)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, Series} of shape (442,)
            The regression target. If `as_frame=True`, `target` will be
            a pandas Series.
        feature_names: list
            The names of the dataset columns.
        frame: DataFrame of shape (442, 11)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        data_filename: str
            The path to the location of the data.
        target_filename: str
            The path to the location of the target.

    (data, target) : tuple if ``return_X_y`` is True
        Returns a tuple of two ndarray of shape (n_samples, n_features)
        A 2D array with each row representing one sample and each column
        representing the features and/or target of a given sample.

        .. versionadded:: 0.18
    """
    data_filename = "diabetes_data.csv.gz"
    target_filename = "diabetes_target.csv.gz"

    # Both arrays ship with scikit-learn as gzip-compressed CSV resources.
    data = load_gzip_compressed_csv_data(data_filename)
    target = load_gzip_compressed_csv_data(target_filename)
    descr = load_descr("diabetes.rst")

    column_names = ["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"]

    frame = None
    if as_frame:
        # Wrap data/target into pandas containers (single "target" column).
        frame, data, target = _convert_data_dataframe(
            "load_diabetes", data, target, column_names, ["target"]
        )

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        frame=frame,
        DESCR=descr,
        feature_names=column_names,
        data_filename=data_filename,
        target_filename=target_filename,
        data_module=DATA_MODULE,
    )
1012
1013
def load_linnerud(*, return_X_y=False, as_frame=False):
    """Load and return the physical exercise Linnerud dataset.

    This dataset is suitable for multi-output regression tasks.

    ==============   ============================
    Samples total    20
    Dimensionality   3 (for both data and target)
    Features         integer
    Targets          integer
    ==============   ============================

    Read more in the :ref:`User Guide <linnerrud_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of target columns.
        If `return_X_y` is True, then (`data`, `target`) will be pandas
        DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (20, 3)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, dataframe} of shape (20, 3)
            The regression targets. If `as_frame=True`, `target` will be
            a pandas DataFrame.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of the target columns.
        frame: DataFrame of shape (20, 6)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        data_filename: str
            The path to the location of the data.
        target_filename: str
            The path to the location of the target.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18
    """
    data_filename = "linnerud_exercise.csv"
    target_filename = "linnerud_physiological.csv"

    def _read_named_csv(name):
        # Each file starts with a header row of column names followed by
        # whitespace-separated numeric rows.
        with resources.open_text(DATA_MODULE, name) as f:
            header = f.readline().split()
            f.seek(0)  # rewind so loadtxt sees the whole file
            values = np.loadtxt(f, skiprows=1)
        return header, values

    header_exercise, data_exercise = _read_named_csv(data_filename)
    header_physiological, data_physiological = _read_named_csv(target_filename)

    fdescr = load_descr("linnerud.rst")

    frame = None
    if as_frame:
        (frame, data_exercise, data_physiological) = _convert_data_dataframe(
            "load_linnerud",
            data_exercise,
            data_physiological,
            header_exercise,
            header_physiological,
        )
    if return_X_y:
        return data_exercise, data_physiological

    return Bunch(
        data=data_exercise,
        feature_names=header_exercise,
        target=data_physiological,
        target_names=header_physiological,
        frame=frame,
        DESCR=fdescr,
        data_filename=data_filename,
        target_filename=target_filename,
        data_module=DATA_MODULE,
    )
1117
1118
@deprecated(
    r"""`load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_housing
        housing = fetch_california_housing()

    for the California housing dataset and::

        from sklearn.datasets import fetch_openml
        housing = fetch_openml(name="house_prices", as_frame=True)

    for the Ames housing dataset.
    """
)
def load_boston(*, return_X_y=False):
    r"""Load and return the boston house-prices dataset (regression).

    ==============   ==============
    Samples total               506
    Dimensionality               13
    Features         real, positive
    Targets           real 5. - 50.
    ==============   ==============

    Read more in the :ref:`User Guide <boston_dataset>`.

    .. deprecated:: 1.0
       This function is deprecated in 1.0 and will be removed in 1.2. See the
       warning message below for further details regarding the alternative
       datasets.

    .. warning::
        The Boston housing prices dataset has an ethical problem: as
        investigated in [1]_, the authors of this dataset engineered a
        non-invertible variable "B" assuming that racial self-segregation had a
        positive impact on house prices [2]_. Furthermore the goal of the
        research that led to the creation of this dataset was to study the
        impact of air quality but it did not give adequate demonstration of the
        validity of this assumption.

        The scikit-learn maintainers therefore strongly discourage the use of
        this dataset unless the purpose of the code is to study and educate
        about ethical issues in data science and machine learning.

        In this special case, you can fetch the dataset from the original
        source::

            import pandas as pd  # doctest: +SKIP
            import numpy as np


            data_url = "http://lib.stat.cmu.edu/datasets/boston"
            raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
            data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
            target = raw_df.values[1::2, 2]

        Alternative datasets include the California housing dataset [3]_
        (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames
        housing dataset [4]_. You can load the datasets as follows::

            from sklearn.datasets import fetch_california_housing
            housing = fetch_california_housing()

        for the California housing dataset and::

            from sklearn.datasets import fetch_openml
            housing = fetch_openml(name="house_prices", as_frame=True)  # noqa

        for the Ames housing dataset.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (506, 13)
            The data matrix.
        target : ndarray of shape (506,)
            The regression target.
        filename : str
            The physical location of boston csv dataset.

            .. versionadded:: 0.20

        DESCR : str
            The full description of the dataset.
        feature_names : ndarray
            The names of features

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    Notes
    -----
        .. versionchanged:: 0.20
            Fixed a wrong data point at [445, 0].

    References
    ----------
    .. [1] `Racist data destruction? M Carlisle,
            <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>`_
    .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.
           "Hedonic housing prices and the demand for clean air."
           Journal of environmental economics and management 5.1 (1978): 81-102.
           <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>`_
    .. [3] `California housing dataset
            <https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset>`_
    .. [4] `Ames housing dataset
            <https://www.openml.org/d/42165>`_

    Examples
    --------
    >>> import warnings
    >>> from sklearn.datasets import load_boston
    >>> with warnings.catch_warnings():
    ...     # You should probably not use this dataset.
    ...     warnings.filterwarnings("ignore")
    ...     X, y = load_boston(return_X_y=True)
    >>> print(X.shape)
    (506, 13)
    """
    # TODO: once the deprecation period is over, implement a module level
    # `__getattr__` function in`sklearn.datasets` to raise an exception with
    # an informative error message at import time instead of just removing
    # load_boston. The goal is to avoid having beginners that copy-paste code
    # from numerous books and tutorials that use this dataset loader get
    # a confusing ImportError when trying to learn scikit-learn.
    # See: https://www.python.org/dev/peps/pep-0562/

    descr_text = load_descr("boston_house_prices.rst")

    data_file_name = "boston_house_prices.csv"
    with resources.open_text(DATA_MODULE, data_file_name) as f:
        data_file = csv.reader(f)
        # First row stores the sample and feature counts.
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,))
        # Second row holds the column names; the last column is the target.
        temp = next(data_file)
        feature_names = np.array(temp)

        for i, d in enumerate(data_file):
            data[i] = np.asarray(d[:-1], dtype=np.float64)
            target[i] = np.asarray(d[-1], dtype=np.float64)

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        # last column is target value
        feature_names=feature_names[:-1],
        DESCR=descr_text,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )
1309
1310
def load_sample_images():
    """Load sample images for image manipulation.

    Loads both, ``china`` and ``flower``.

    Read more in the :ref:`User Guide <sample_images>`.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        images : list of ndarray of shape (427, 640, 3)
            The two sample image.
        filenames : list
            The filenames for the images.
        DESCR : str
            The full description of the dataset.

    Examples
    --------
    To load the data and visualize the images:

    >>> from sklearn.datasets import load_sample_images
    >>> dataset = load_sample_images()     #doctest: +SKIP
    >>> len(dataset.images)                #doctest: +SKIP
    2
    >>> first_img_data = dataset.images[0] #doctest: +SKIP
    >>> first_img_data.shape               #doctest: +SKIP
    (427, 640, 3)
    >>> first_img_data.dtype               #doctest: +SKIP
    dtype('uint8')
    """
    # import PIL only when needed
    from ..externals._pilutil import imread

    descr = load_descr("README.txt", descr_module=IMAGES_MODULE)

    # Collect the bundled JPEG resources in a deterministic (sorted) order.
    jpg_names = [
        name
        for name in sorted(resources.contents(IMAGES_MODULE))
        if name.endswith(".jpg")
    ]
    loaded_images = []
    for name in jpg_names:
        with resources.open_binary(IMAGES_MODULE, name) as image_file:
            loaded_images.append(imread(image_file))

    return Bunch(images=loaded_images, filenames=jpg_names, DESCR=descr)
1358
1359
def load_sample_image(image_name):
    """Load the numpy array of a single sample image

    Read more in the :ref:`User Guide <sample_images>`.

    Parameters
    ----------
    image_name : {`china.jpg`, `flower.jpg`}
        The name of the sample image loaded

    Returns
    -------
    img : 3D array
        The image as a numpy array: height x width x color

    Examples
    --------

    >>> from sklearn.datasets import load_sample_image
    >>> china = load_sample_image('china.jpg')   # doctest: +SKIP
    >>> china.dtype                              # doctest: +SKIP
    dtype('uint8')
    >>> china.shape                              # doctest: +SKIP
    (427, 640, 3)
    >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP
    >>> flower.dtype                             # doctest: +SKIP
    dtype('uint8')
    >>> flower.shape                             # doctest: +SKIP
    (427, 640, 3)
    """
    dataset = load_sample_images()
    # Pick the first bundled image whose filename ends with the request.
    match = next(
        (
            img
            for fname, img in zip(dataset.filenames, dataset.images)
            if fname.endswith(image_name)
        ),
        None,
    )
    if match is None:
        raise AttributeError("Cannot find sample image: %s" % image_name)
    return match
1399
1400
1401def _pkl_filepath(*args, **kwargs):
1402    """Return filename for Python 3 pickles
1403
1404    args[-1] is expected to be the ".pkl" filename. For compatibility with
1405    older scikit-learn versions, a suffix is inserted before the extension.
1406
1407    _pkl_filepath('/path/to/folder', 'filename.pkl') returns
1408    '/path/to/folder/filename_py3.pkl'
1409
1410    """
1411    py3_suffix = kwargs.get("py3_suffix", "_py3")
1412    basename, ext = splitext(args[-1])
1413    basename += py3_suffix
1414    new_args = args[:-1] + (basename + ext,)
1415    return join(*new_args)
1416
1417
1418def _sha256(path):
1419    """Calculate the sha256 hash of the file at path."""
1420    sha256hash = hashlib.sha256()
1421    chunk_size = 8192
1422    with open(path, "rb") as f:
1423        while True:
1424            buffer = f.read(chunk_size)
1425            if not buffer:
1426                break
1427            sha256hash.update(buffer)
1428    return sha256hash.hexdigest()
1429
1430
def _fetch_remote(remote, dirname=None):
    """Helper function to download a remote dataset into path

    Fetch a dataset pointed by remote's url, save into path using remote's
    filename and ensure its integrity based on the SHA256 Checksum of the
    downloaded file.

    Parameters
    ----------
    remote : RemoteFileMetadata
        Named tuple containing remote dataset meta information: url, filename
        and checksum

    dirname : str
        Directory to save the file to.

    Returns
    -------
    file_path: str
        Full path of the created file.
    """
    if dirname is None:
        file_path = remote.filename
    else:
        file_path = join(dirname, remote.filename)

    urlretrieve(remote.url, file_path)

    # Verify integrity of the download before handing the path back.
    actual_checksum = _sha256(file_path)
    if actual_checksum != remote.checksum:
        raise IOError(
            "{} has an SHA256 checksum ({}) "
            "differing from expected ({}), "
            "file may be corrupted.".format(file_path, actual_checksum, remote.checksum)
        )
    return file_path
1463