1""" 2Base IO code for all datasets 3""" 4 5# Copyright (c) 2007 David Cournapeau <cournape@gmail.com> 6# 2010 Fabian Pedregosa <fabian.pedregosa@inria.fr> 7# 2010 Olivier Grisel <olivier.grisel@ensta.org> 8# License: BSD 3 clause 9import csv 10import hashlib 11import gzip 12import shutil 13from collections import namedtuple 14from os import environ, listdir, makedirs 15from os.path import expanduser, isdir, join, splitext 16from importlib import resources 17 18from ..utils import Bunch 19from ..utils import check_random_state 20from ..utils import check_pandas_support 21from ..utils.deprecation import deprecated 22 23import numpy as np 24 25from urllib.request import urlretrieve 26 27DATA_MODULE = "sklearn.datasets.data" 28DESCR_MODULE = "sklearn.datasets.descr" 29IMAGES_MODULE = "sklearn.datasets.images" 30 31RemoteFileMetadata = namedtuple("RemoteFileMetadata", ["filename", "url", "checksum"]) 32 33 34def get_data_home(data_home=None) -> str: 35 """Return the path of the scikit-learn data dir. 36 37 This folder is used by some large dataset loaders to avoid downloading the 38 data several times. 39 40 By default the data dir is set to a folder named 'scikit_learn_data' in the 41 user home folder. 42 43 Alternatively, it can be set by the 'SCIKIT_LEARN_DATA' environment 44 variable or programmatically by giving an explicit folder path. The '~' 45 symbol is expanded to the user home folder. 46 47 If the folder does not already exist, it is automatically created. 48 49 Parameters 50 ---------- 51 data_home : str, default=None 52 The path to scikit-learn data directory. If `None`, the default path 53 is `~/sklearn_learn_data`. 54 """ 55 if data_home is None: 56 data_home = environ.get("SCIKIT_LEARN_DATA", join("~", "scikit_learn_data")) 57 data_home = expanduser(data_home) 58 makedirs(data_home, exist_ok=True) 59 return data_home 60 61 62def clear_data_home(data_home=None): 63 """Delete all the content of the data home cache. 
64 65 Parameters 66 ---------- 67 data_home : str, default=None 68 The path to scikit-learn data directory. If `None`, the default path 69 is `~/sklearn_learn_data`. 70 """ 71 data_home = get_data_home(data_home) 72 shutil.rmtree(data_home) 73 74 75def _convert_data_dataframe( 76 caller_name, data, target, feature_names, target_names, sparse_data=False 77): 78 pd = check_pandas_support("{} with as_frame=True".format(caller_name)) 79 if not sparse_data: 80 data_df = pd.DataFrame(data, columns=feature_names) 81 else: 82 data_df = pd.DataFrame.sparse.from_spmatrix(data, columns=feature_names) 83 84 target_df = pd.DataFrame(target, columns=target_names) 85 combined_df = pd.concat([data_df, target_df], axis=1) 86 X = combined_df[feature_names] 87 y = combined_df[target_names] 88 if y.shape[1] == 1: 89 y = y.iloc[:, 0] 90 return combined_df, X, y 91 92 93def load_files( 94 container_path, 95 *, 96 description=None, 97 categories=None, 98 load_content=True, 99 shuffle=True, 100 encoding=None, 101 decode_error="strict", 102 random_state=0, 103): 104 """Load text files with categories as subfolder names. 105 106 Individual samples are assumed to be files stored a two levels folder 107 structure such as the following: 108 109 container_folder/ 110 category_1_folder/ 111 file_1.txt 112 file_2.txt 113 ... 114 file_42.txt 115 category_2_folder/ 116 file_43.txt 117 file_44.txt 118 ... 119 120 The folder names are used as supervised signal label names. The individual 121 file names are not important. 122 123 This function does not try to extract features into a numpy array or scipy 124 sparse matrix. In addition, if load_content is false it does not try to 125 load the files in memory. 126 127 To use text files in a scikit-learn classification or clustering algorithm, 128 you will need to use the :mod`~sklearn.feature_extraction.text` module to 129 build a feature extraction transformer that suits your problem. 
130 131 If you set load_content=True, you should also specify the encoding of the 132 text using the 'encoding' parameter. For many modern text files, 'utf-8' 133 will be the correct encoding. If you leave encoding equal to None, then the 134 content will be made of bytes instead of Unicode, and you will not be able 135 to use most functions in :mod:`~sklearn.feature_extraction.text`. 136 137 Similar feature extractors should be built for other kind of unstructured 138 data input such as images, audio, video, ... 139 140 Read more in the :ref:`User Guide <datasets>`. 141 142 Parameters 143 ---------- 144 container_path : str 145 Path to the main folder holding one subfolder per category. 146 147 description : str, default=None 148 A paragraph describing the characteristic of the dataset: its source, 149 reference, etc. 150 151 categories : list of str, default=None 152 If None (default), load all the categories. If not None, list of 153 category names to load (other categories ignored). 154 155 load_content : bool, default=True 156 Whether to load or not the content of the different files. If true a 157 'data' attribute containing the text information is present in the data 158 structure returned. If not, a filenames attribute gives the path to the 159 files. 160 161 shuffle : bool, default=True 162 Whether or not to shuffle the data: might be important for models that 163 make the assumption that the samples are independent and identically 164 distributed (i.i.d.), such as stochastic gradient descent. 165 166 encoding : str, default=None 167 If None, do not try to decode the content of the files (e.g. for images 168 or other non-text content). If not None, encoding to use to decode text 169 files to Unicode if load_content is True. 170 171 decode_error : {'strict', 'ignore', 'replace'}, default='strict' 172 Instruction on what to do if a byte sequence is given to analyze that 173 contains characters not of the given `encoding`. 
Passed as keyword 174 argument 'errors' to bytes.decode. 175 176 random_state : int, RandomState instance or None, default=0 177 Determines random number generation for dataset shuffling. Pass an int 178 for reproducible output across multiple function calls. 179 See :term:`Glossary <random_state>`. 180 181 Returns 182 ------- 183 data : :class:`~sklearn.utils.Bunch` 184 Dictionary-like object, with the following attributes. 185 186 data : list of str 187 Only present when `load_content=True`. 188 The raw text data to learn. 189 target : ndarray 190 The target labels (integer index). 191 target_names : list 192 The names of target classes. 193 DESCR : str 194 The full description of the dataset. 195 filenames: ndarray 196 The filenames holding the dataset. 197 """ 198 target = [] 199 target_names = [] 200 filenames = [] 201 202 folders = [ 203 f for f in sorted(listdir(container_path)) if isdir(join(container_path, f)) 204 ] 205 206 if categories is not None: 207 folders = [f for f in folders if f in categories] 208 209 for label, folder in enumerate(folders): 210 target_names.append(folder) 211 folder_path = join(container_path, folder) 212 documents = [join(folder_path, d) for d in sorted(listdir(folder_path))] 213 target.extend(len(documents) * [label]) 214 filenames.extend(documents) 215 216 # convert to array for fancy indexing 217 filenames = np.array(filenames) 218 target = np.array(target) 219 220 if shuffle: 221 random_state = check_random_state(random_state) 222 indices = np.arange(filenames.shape[0]) 223 random_state.shuffle(indices) 224 filenames = filenames[indices] 225 target = target[indices] 226 227 if load_content: 228 data = [] 229 for filename in filenames: 230 with open(filename, "rb") as f: 231 data.append(f.read()) 232 if encoding is not None: 233 data = [d.decode(encoding, decode_error) for d in data] 234 return Bunch( 235 data=data, 236 filenames=filenames, 237 target_names=target_names, 238 target=target, 239 DESCR=description, 240 ) 241 
242 return Bunch( 243 filenames=filenames, target_names=target_names, target=target, DESCR=description 244 ) 245 246 247def load_csv_data( 248 data_file_name, 249 *, 250 data_module=DATA_MODULE, 251 descr_file_name=None, 252 descr_module=DESCR_MODULE, 253): 254 """Loads `data_file_name` from `data_module with `importlib.resources`. 255 256 Parameters 257 ---------- 258 data_file_name : str 259 Name of csv file to be loaded from `data_module/data_file_name`. 260 For example `'wine_data.csv'`. 261 262 data_module : str or module, default='sklearn.datasets.data' 263 Module where data lives. The default is `'sklearn.datasets.data'`. 264 265 descr_file_name : str, default=None 266 Name of rst file to be loaded from `descr_module/descr_file_name`. 267 For example `'wine_data.rst'`. See also :func:`load_descr`. 268 If not None, also returns the corresponding description of 269 the dataset. 270 271 descr_module : str or module, default='sklearn.datasets.descr' 272 Module where `descr_file_name` lives. See also :func:`load_descr`. 273 The default is `'sklearn.datasets.descr'`. 274 275 Returns 276 ------- 277 data : ndarray of shape (n_samples, n_features) 278 A 2D array with each row representing one sample and each column 279 representing the features of a given sample. 280 281 target : ndarry of shape (n_samples,) 282 A 1D array holding target variables for all the samples in `data`. 283 For example target[0] is the target variable for data[0]. 284 285 target_names : ndarry of shape (n_samples,) 286 A 1D array containing the names of the classifications. For example 287 target_names[0] is the name of the target[0] class. 288 289 descr : str, optional 290 Description of the dataset (the content of `descr_file_name`). 291 Only returned if `descr_file_name` is not None. 
292 """ 293 with resources.open_text(data_module, data_file_name) as csv_file: 294 data_file = csv.reader(csv_file) 295 temp = next(data_file) 296 n_samples = int(temp[0]) 297 n_features = int(temp[1]) 298 target_names = np.array(temp[2:]) 299 data = np.empty((n_samples, n_features)) 300 target = np.empty((n_samples,), dtype=int) 301 302 for i, ir in enumerate(data_file): 303 data[i] = np.asarray(ir[:-1], dtype=np.float64) 304 target[i] = np.asarray(ir[-1], dtype=int) 305 306 if descr_file_name is None: 307 return data, target, target_names 308 else: 309 assert descr_module is not None 310 descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name) 311 return data, target, target_names, descr 312 313 314def load_gzip_compressed_csv_data( 315 data_file_name, 316 *, 317 data_module=DATA_MODULE, 318 descr_file_name=None, 319 descr_module=DESCR_MODULE, 320 encoding="utf-8", 321 **kwargs, 322): 323 """Loads gzip-compressed `data_file_name` from `data_module` with `importlib.resources`. 324 325 1) Open resource file with `importlib.resources.open_binary` 326 2) Decompress file obj with `gzip.open` 327 3) Load decompressed data with `np.loadtxt` 328 329 Parameters 330 ---------- 331 data_file_name : str 332 Name of gzip-compressed csv file (`'*.csv.gz'`) to be loaded from 333 `data_module/data_file_name`. For example `'diabetes_data.csv.gz'`. 334 335 data_module : str or module, default='sklearn.datasets.data' 336 Module where data lives. The default is `'sklearn.datasets.data'`. 337 338 descr_file_name : str, default=None 339 Name of rst file to be loaded from `descr_module/descr_file_name`. 340 For example `'wine_data.rst'`. See also :func:`load_descr`. 341 If not None, also returns the corresponding description of 342 the dataset. 343 344 descr_module : str or module, default='sklearn.datasets.descr' 345 Module where `descr_file_name` lives. See also :func:`load_descr`. 346 The default is `'sklearn.datasets.descr'`. 
347 348 encoding : str, default="utf-8" 349 Name of the encoding that the gzip-decompressed file will be 350 decoded with. The default is 'utf-8'. 351 352 **kwargs : dict, optional 353 Keyword arguments to be passed to `np.loadtxt`; 354 e.g. delimiter=','. 355 356 Returns 357 ------- 358 data : ndarray of shape (n_samples, n_features) 359 A 2D array with each row representing one sample and each column 360 representing the features and/or target of a given sample. 361 362 descr : str, optional 363 Description of the dataset (the content of `descr_file_name`). 364 Only returned if `descr_file_name` is not None. 365 """ 366 with resources.open_binary(data_module, data_file_name) as compressed_file: 367 compressed_file = gzip.open(compressed_file, mode="rt", encoding=encoding) 368 data = np.loadtxt(compressed_file, **kwargs) 369 370 if descr_file_name is None: 371 return data 372 else: 373 assert descr_module is not None 374 descr = load_descr(descr_module=descr_module, descr_file_name=descr_file_name) 375 return data, descr 376 377 378def load_descr(descr_file_name, *, descr_module=DESCR_MODULE): 379 """Load `descr_file_name` from `descr_module` with `importlib.resources`. 380 381 Parameters 382 ---------- 383 descr_file_name : str, default=None 384 Name of rst file to be loaded from `descr_module/descr_file_name`. 385 For example `'wine_data.rst'`. See also :func:`load_descr`. 386 If not None, also returns the corresponding description of 387 the dataset. 388 389 descr_module : str or module, default='sklearn.datasets.descr' 390 Module where `descr_file_name` lives. See also :func:`load_descr`. 391 The default is `'sklearn.datasets.descr'`. 392 393 Returns 394 ------- 395 fdescr : str 396 Content of `descr_file_name`. 397 """ 398 fdescr = resources.read_text(descr_module, descr_file_name) 399 400 return fdescr 401 402 403def load_wine(*, return_X_y=False, as_frame=False): 404 """Load and return the wine dataset (classification). 405 406 .. 
versionadded:: 0.18 407 408 The wine dataset is a classic and very easy multi-class classification 409 dataset. 410 411 ================= ============== 412 Classes 3 413 Samples per class [59,71,48] 414 Samples total 178 415 Dimensionality 13 416 Features real, positive 417 ================= ============== 418 419 Read more in the :ref:`User Guide <wine_dataset>`. 420 421 Parameters 422 ---------- 423 return_X_y : bool, default=False 424 If True, returns ``(data, target)`` instead of a Bunch object. 425 See below for more information about the `data` and `target` object. 426 427 as_frame : bool, default=False 428 If True, the data is a pandas DataFrame including columns with 429 appropriate dtypes (numeric). The target is 430 a pandas DataFrame or Series depending on the number of target columns. 431 If `return_X_y` is True, then (`data`, `target`) will be pandas 432 DataFrames or Series as described below. 433 434 .. versionadded:: 0.23 435 436 Returns 437 ------- 438 data : :class:`~sklearn.utils.Bunch` 439 Dictionary-like object, with the following attributes. 440 441 data : {ndarray, dataframe} of shape (178, 13) 442 The data matrix. If `as_frame=True`, `data` will be a pandas 443 DataFrame. 444 target: {ndarray, Series} of shape (178,) 445 The classification target. If `as_frame=True`, `target` will be 446 a pandas Series. 447 feature_names: list 448 The names of the dataset columns. 449 target_names: list 450 The names of target classes. 451 frame: DataFrame of shape (178, 14) 452 Only present when `as_frame=True`. DataFrame with `data` and 453 `target`. 454 455 .. versionadded:: 0.23 456 DESCR: str 457 The full description of the dataset. 
458 459 (data, target) : tuple if ``return_X_y`` is True 460 461 The copy of UCI ML Wine Data Set dataset is downloaded and modified to fit 462 standard format from: 463 https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data 464 465 Examples 466 -------- 467 Let's say you are interested in the samples 10, 80, and 140, and want to 468 know their class name. 469 470 >>> from sklearn.datasets import load_wine 471 >>> data = load_wine() 472 >>> data.target[[10, 80, 140]] 473 array([0, 1, 2]) 474 >>> list(data.target_names) 475 ['class_0', 'class_1', 'class_2'] 476 """ 477 478 data, target, target_names, fdescr = load_csv_data( 479 data_file_name="wine_data.csv", descr_file_name="wine_data.rst" 480 ) 481 482 feature_names = [ 483 "alcohol", 484 "malic_acid", 485 "ash", 486 "alcalinity_of_ash", 487 "magnesium", 488 "total_phenols", 489 "flavanoids", 490 "nonflavanoid_phenols", 491 "proanthocyanins", 492 "color_intensity", 493 "hue", 494 "od280/od315_of_diluted_wines", 495 "proline", 496 ] 497 498 frame = None 499 target_columns = [ 500 "target", 501 ] 502 if as_frame: 503 frame, data, target = _convert_data_dataframe( 504 "load_wine", data, target, feature_names, target_columns 505 ) 506 507 if return_X_y: 508 return data, target 509 510 return Bunch( 511 data=data, 512 target=target, 513 frame=frame, 514 target_names=target_names, 515 DESCR=fdescr, 516 feature_names=feature_names, 517 ) 518 519 520def load_iris(*, return_X_y=False, as_frame=False): 521 """Load and return the iris dataset (classification). 522 523 The iris dataset is a classic and very easy multi-class classification 524 dataset. 525 526 ================= ============== 527 Classes 3 528 Samples per class 50 529 Samples total 150 530 Dimensionality 4 531 Features real, positive 532 ================= ============== 533 534 Read more in the :ref:`User Guide <iris_dataset>`. 
535 536 Parameters 537 ---------- 538 return_X_y : bool, default=False 539 If True, returns ``(data, target)`` instead of a Bunch object. See 540 below for more information about the `data` and `target` object. 541 542 .. versionadded:: 0.18 543 544 as_frame : bool, default=False 545 If True, the data is a pandas DataFrame including columns with 546 appropriate dtypes (numeric). The target is 547 a pandas DataFrame or Series depending on the number of target columns. 548 If `return_X_y` is True, then (`data`, `target`) will be pandas 549 DataFrames or Series as described below. 550 551 .. versionadded:: 0.23 552 553 Returns 554 ------- 555 data : :class:`~sklearn.utils.Bunch` 556 Dictionary-like object, with the following attributes. 557 558 data : {ndarray, dataframe} of shape (150, 4) 559 The data matrix. If `as_frame=True`, `data` will be a pandas 560 DataFrame. 561 target: {ndarray, Series} of shape (150,) 562 The classification target. If `as_frame=True`, `target` will be 563 a pandas Series. 564 feature_names: list 565 The names of the dataset columns. 566 target_names: list 567 The names of target classes. 568 frame: DataFrame of shape (150, 5) 569 Only present when `as_frame=True`. DataFrame with `data` and 570 `target`. 571 572 .. versionadded:: 0.23 573 DESCR: str 574 The full description of the dataset. 575 filename: str 576 The path to the location of the data. 577 578 .. versionadded:: 0.20 579 580 (data, target) : tuple if ``return_X_y`` is True 581 A tuple of two ndarray. The first containing a 2D array of shape 582 (n_samples, n_features) with each row representing one sample and 583 each column representing the features. The second ndarray of shape 584 (n_samples,) containing the target samples. 585 586 .. versionadded:: 0.18 587 588 Notes 589 ----- 590 .. versionchanged:: 0.20 591 Fixed two wrong data points according to Fisher's paper. 592 The new version is the same as in R, but not as in the UCI 593 Machine Learning Repository. 
594 595 Examples 596 -------- 597 Let's say you are interested in the samples 10, 25, and 50, and want to 598 know their class name. 599 600 >>> from sklearn.datasets import load_iris 601 >>> data = load_iris() 602 >>> data.target[[10, 25, 50]] 603 array([0, 0, 1]) 604 >>> list(data.target_names) 605 ['setosa', 'versicolor', 'virginica'] 606 """ 607 data_file_name = "iris.csv" 608 data, target, target_names, fdescr = load_csv_data( 609 data_file_name=data_file_name, descr_file_name="iris.rst" 610 ) 611 612 feature_names = [ 613 "sepal length (cm)", 614 "sepal width (cm)", 615 "petal length (cm)", 616 "petal width (cm)", 617 ] 618 619 frame = None 620 target_columns = [ 621 "target", 622 ] 623 if as_frame: 624 frame, data, target = _convert_data_dataframe( 625 "load_iris", data, target, feature_names, target_columns 626 ) 627 628 if return_X_y: 629 return data, target 630 631 return Bunch( 632 data=data, 633 target=target, 634 frame=frame, 635 target_names=target_names, 636 DESCR=fdescr, 637 feature_names=feature_names, 638 filename=data_file_name, 639 data_module=DATA_MODULE, 640 ) 641 642 643def load_breast_cancer(*, return_X_y=False, as_frame=False): 644 """Load and return the breast cancer wisconsin dataset (classification). 645 646 The breast cancer dataset is a classic and very easy binary classification 647 dataset. 648 649 ================= ============== 650 Classes 2 651 Samples per class 212(M),357(B) 652 Samples total 569 653 Dimensionality 30 654 Features real, positive 655 ================= ============== 656 657 Read more in the :ref:`User Guide <breast_cancer_dataset>`. 658 659 Parameters 660 ---------- 661 return_X_y : bool, default=False 662 If True, returns ``(data, target)`` instead of a Bunch object. 663 See below for more information about the `data` and `target` object. 664 665 .. versionadded:: 0.18 666 667 as_frame : bool, default=False 668 If True, the data is a pandas DataFrame including columns with 669 appropriate dtypes (numeric). 
The target is 670 a pandas DataFrame or Series depending on the number of target columns. 671 If `return_X_y` is True, then (`data`, `target`) will be pandas 672 DataFrames or Series as described below. 673 674 .. versionadded:: 0.23 675 676 Returns 677 ------- 678 data : :class:`~sklearn.utils.Bunch` 679 Dictionary-like object, with the following attributes. 680 681 data : {ndarray, dataframe} of shape (569, 30) 682 The data matrix. If `as_frame=True`, `data` will be a pandas 683 DataFrame. 684 target: {ndarray, Series} of shape (569,) 685 The classification target. If `as_frame=True`, `target` will be 686 a pandas Series. 687 feature_names: list 688 The names of the dataset columns. 689 target_names: list 690 The names of target classes. 691 frame: DataFrame of shape (569, 31) 692 Only present when `as_frame=True`. DataFrame with `data` and 693 `target`. 694 695 .. versionadded:: 0.23 696 DESCR: str 697 The full description of the dataset. 698 filename: str 699 The path to the location of the data. 700 701 .. versionadded:: 0.20 702 703 (data, target) : tuple if ``return_X_y`` is True 704 705 .. versionadded:: 0.18 706 707 The copy of UCI ML Breast Cancer Wisconsin (Diagnostic) dataset is 708 downloaded from: 709 https://goo.gl/U2Uwz2 710 711 Examples 712 -------- 713 Let's say you are interested in the samples 10, 50, and 85, and want to 714 know their class name. 
715 716 >>> from sklearn.datasets import load_breast_cancer 717 >>> data = load_breast_cancer() 718 >>> data.target[[10, 50, 85]] 719 array([0, 1, 0]) 720 >>> list(data.target_names) 721 ['malignant', 'benign'] 722 """ 723 data_file_name = "breast_cancer.csv" 724 data, target, target_names, fdescr = load_csv_data( 725 data_file_name=data_file_name, descr_file_name="breast_cancer.rst" 726 ) 727 728 feature_names = np.array( 729 [ 730 "mean radius", 731 "mean texture", 732 "mean perimeter", 733 "mean area", 734 "mean smoothness", 735 "mean compactness", 736 "mean concavity", 737 "mean concave points", 738 "mean symmetry", 739 "mean fractal dimension", 740 "radius error", 741 "texture error", 742 "perimeter error", 743 "area error", 744 "smoothness error", 745 "compactness error", 746 "concavity error", 747 "concave points error", 748 "symmetry error", 749 "fractal dimension error", 750 "worst radius", 751 "worst texture", 752 "worst perimeter", 753 "worst area", 754 "worst smoothness", 755 "worst compactness", 756 "worst concavity", 757 "worst concave points", 758 "worst symmetry", 759 "worst fractal dimension", 760 ] 761 ) 762 763 frame = None 764 target_columns = [ 765 "target", 766 ] 767 if as_frame: 768 frame, data, target = _convert_data_dataframe( 769 "load_breast_cancer", data, target, feature_names, target_columns 770 ) 771 772 if return_X_y: 773 return data, target 774 775 return Bunch( 776 data=data, 777 target=target, 778 frame=frame, 779 target_names=target_names, 780 DESCR=fdescr, 781 feature_names=feature_names, 782 filename=data_file_name, 783 data_module=DATA_MODULE, 784 ) 785 786 787def load_digits(*, n_class=10, return_X_y=False, as_frame=False): 788 """Load and return the digits dataset (classification). 789 790 Each datapoint is a 8x8 image of a digit. 
791 792 ================= ============== 793 Classes 10 794 Samples per class ~180 795 Samples total 1797 796 Dimensionality 64 797 Features integers 0-16 798 ================= ============== 799 800 Read more in the :ref:`User Guide <digits_dataset>`. 801 802 Parameters 803 ---------- 804 n_class : int, default=10 805 The number of classes to return. Between 0 and 10. 806 807 return_X_y : bool, default=False 808 If True, returns ``(data, target)`` instead of a Bunch object. 809 See below for more information about the `data` and `target` object. 810 811 .. versionadded:: 0.18 812 813 as_frame : bool, default=False 814 If True, the data is a pandas DataFrame including columns with 815 appropriate dtypes (numeric). The target is 816 a pandas DataFrame or Series depending on the number of target columns. 817 If `return_X_y` is True, then (`data`, `target`) will be pandas 818 DataFrames or Series as described below. 819 820 .. versionadded:: 0.23 821 822 Returns 823 ------- 824 data : :class:`~sklearn.utils.Bunch` 825 Dictionary-like object, with the following attributes. 826 827 data : {ndarray, dataframe} of shape (1797, 64) 828 The flattened data matrix. If `as_frame=True`, `data` will be 829 a pandas DataFrame. 830 target: {ndarray, Series} of shape (1797,) 831 The classification target. If `as_frame=True`, `target` will be 832 a pandas Series. 833 feature_names: list 834 The names of the dataset columns. 835 target_names: list 836 The names of target classes. 837 838 .. versionadded:: 0.20 839 840 frame: DataFrame of shape (1797, 65) 841 Only present when `as_frame=True`. DataFrame with `data` and 842 `target`. 843 844 .. versionadded:: 0.23 845 images: {ndarray} of shape (1797, 8, 8) 846 The raw image data. 847 DESCR: str 848 The full description of the dataset. 849 850 (data, target) : tuple if ``return_X_y`` is True 851 852 .. 
versionadded:: 0.18 853 854 This is a copy of the test set of the UCI ML hand-written digits datasets 855 https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits 856 857 Examples 858 -------- 859 To load the data and visualize the images:: 860 861 >>> from sklearn.datasets import load_digits 862 >>> digits = load_digits() 863 >>> print(digits.data.shape) 864 (1797, 64) 865 >>> import matplotlib.pyplot as plt 866 >>> plt.gray() 867 >>> plt.matshow(digits.images[0]) 868 <...> 869 >>> plt.show() 870 """ 871 872 data, fdescr = load_gzip_compressed_csv_data( 873 data_file_name="digits.csv.gz", descr_file_name="digits.rst", delimiter="," 874 ) 875 876 target = data[:, -1].astype(int, copy=False) 877 flat_data = data[:, :-1] 878 images = flat_data.view() 879 images.shape = (-1, 8, 8) 880 881 if n_class < 10: 882 idx = target < n_class 883 flat_data, target = flat_data[idx], target[idx] 884 images = images[idx] 885 886 feature_names = [ 887 "pixel_{}_{}".format(row_idx, col_idx) 888 for row_idx in range(8) 889 for col_idx in range(8) 890 ] 891 892 frame = None 893 target_columns = [ 894 "target", 895 ] 896 if as_frame: 897 frame, flat_data, target = _convert_data_dataframe( 898 "load_digits", flat_data, target, feature_names, target_columns 899 ) 900 901 if return_X_y: 902 return flat_data, target 903 904 return Bunch( 905 data=flat_data, 906 target=target, 907 frame=frame, 908 feature_names=feature_names, 909 target_names=np.arange(10), 910 images=images, 911 DESCR=fdescr, 912 ) 913 914 915def load_diabetes(*, return_X_y=False, as_frame=False): 916 """Load and return the diabetes dataset (regression). 917 918 ============== ================== 919 Samples total 442 920 Dimensionality 10 921 Features real, -.2 < x < .2 922 Targets integer 25 - 346 923 ============== ================== 924 925 .. note:: 926 The meaning of each feature (i.e. 
`feature_names`) might be unclear 927 (especially for `ltg`) as the documentation of the original dataset is 928 not explicit. We provide information that seems correct in regard with 929 the scientific literature in this field of research. 930 931 Read more in the :ref:`User Guide <diabetes_dataset>`. 932 933 Parameters 934 ---------- 935 return_X_y : bool, default=False 936 If True, returns ``(data, target)`` instead of a Bunch object. 937 See below for more information about the `data` and `target` object. 938 939 .. versionadded:: 0.18 940 941 as_frame : bool, default=False 942 If True, the data is a pandas DataFrame including columns with 943 appropriate dtypes (numeric). The target is 944 a pandas DataFrame or Series depending on the number of target columns. 945 If `return_X_y` is True, then (`data`, `target`) will be pandas 946 DataFrames or Series as described below. 947 948 .. versionadded:: 0.23 949 950 Returns 951 ------- 952 data : :class:`~sklearn.utils.Bunch` 953 Dictionary-like object, with the following attributes. 954 955 data : {ndarray, dataframe} of shape (442, 10) 956 The data matrix. If `as_frame=True`, `data` will be a pandas 957 DataFrame. 958 target: {ndarray, Series} of shape (442,) 959 The regression target. If `as_frame=True`, `target` will be 960 a pandas Series. 961 feature_names: list 962 The names of the dataset columns. 963 frame: DataFrame of shape (442, 11) 964 Only present when `as_frame=True`. DataFrame with `data` and 965 `target`. 966 967 .. versionadded:: 0.23 968 DESCR: str 969 The full description of the dataset. 970 data_filename: str 971 The path to the location of the data. 972 target_filename: str 973 The path to the location of the target. 974 975 (data, target) : tuple if ``return_X_y`` is True 976 Returns a tuple of two ndarray of shape (n_samples, n_features) 977 A 2D array with each row representing one sample and each column 978 representing the features and/or target of a given sample. 979 .. 
def load_linnerud(*, return_X_y=False, as_frame=False):
    """Load and return the physical exercise Linnerud dataset.

    This dataset is suitable for multi-output regression tasks.

    ==============   ============================
    Samples total    20
    Dimensionality   3 (for both data and target)
    Features         integer
    Targets          integer
    ==============   ============================

    Read more in the :ref:`User Guide <linnerrud_dataset>`.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    as_frame : bool, default=False
        If True, the data is a pandas DataFrame including columns with
        appropriate dtypes (numeric, string or categorical). The target is
        a pandas DataFrame or Series depending on the number of target
        columns. If `return_X_y` is True, then (`data`, `target`) will be
        pandas DataFrames or Series as described below.

        .. versionadded:: 0.23

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : {ndarray, dataframe} of shape (20, 3)
            The data matrix. If `as_frame=True`, `data` will be a pandas
            DataFrame.
        target: {ndarray, dataframe} of shape (20, 3)
            The regression targets. If `as_frame=True`, `target` will be
            a pandas DataFrame.
        feature_names: list
            The names of the dataset columns.
        target_names: list
            The names of the target columns.
        frame: DataFrame of shape (20, 6)
            Only present when `as_frame=True`. DataFrame with `data` and
            `target`.

            .. versionadded:: 0.23
        DESCR: str
            The full description of the dataset.
        data_filename: str
            The path to the location of the data.
        target_filename: str
            The path to the location of the target.

            .. versionadded:: 0.20

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18
    """

    def _read_header_and_values(csv_name):
        # Each packaged CSV holds a whitespace-separated header row followed
        # by the numeric samples; return both pieces.
        with resources.open_text(DATA_MODULE, csv_name) as fh:
            column_names = fh.readline().split()
            fh.seek(0)  # rewind so np.loadtxt sees the whole file again
            values = np.loadtxt(fh, skiprows=1)
        return column_names, values

    exercise_file = "linnerud_exercise.csv"
    physiological_file = "linnerud_physiological.csv"

    header_exercise, data_exercise = _read_header_and_values(exercise_file)
    header_physiological, data_physiological = _read_header_and_values(
        physiological_file
    )

    descr = load_descr("linnerud.rst")

    frame = None
    if as_frame:
        frame, data_exercise, data_physiological = _convert_data_dataframe(
            "load_linnerud",
            data_exercise,
            data_physiological,
            header_exercise,
            header_physiological,
        )
    if return_X_y:
        return data_exercise, data_physiological

    return Bunch(
        data=data_exercise,
        feature_names=header_exercise,
        target=data_physiological,
        target_names=header_physiological,
        frame=frame,
        DESCR=descr,
        data_filename=exercise_file,
        target_filename=physiological_file,
        data_module=DATA_MODULE,
    )
@deprecated(
    r"""`load_boston` is deprecated in 1.0 and will be removed in 1.2.

    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_housing
        housing = fetch_california_housing()

    for the California housing dataset and::

        from sklearn.datasets import fetch_openml
        housing = fetch_openml(name="house_prices", as_frame=True)

    for the Ames housing dataset.
    """
)
def load_boston(*, return_X_y=False):
    r"""Load and return the boston house-prices dataset (regression).

    ==============   ==============
    Samples total               506
    Dimensionality               13
    Features         real, positive
    Targets           real 5. - 50.
    ==============   ==============

    Read more in the :ref:`User Guide <boston_dataset>`.

    .. deprecated:: 1.0
       This function is deprecated in 1.0 and will be removed in 1.2. See the
       warning message below for further details regarding the alternative
       datasets.

    .. warning::
       The Boston housing prices dataset has an ethical problem: as
       investigated in [1]_, the authors of this dataset engineered a
       non-invertible variable "B" assuming that racial self-segregation had a
       positive impact on house prices [2]_. Furthermore the goal of the
       research that led to the creation of this dataset was to study the
       impact of air quality but it did not give adequate demonstration of the
       validity of this assumption.

       The scikit-learn maintainers therefore strongly discourage the use of
       this dataset unless the purpose of the code is to study and educate
       about ethical issues in data science and machine learning.

       In this special case, you can fetch the dataset from the original
       source::

           import pandas as pd  # doctest: +SKIP
           import numpy as np


           data_url = "http://lib.stat.cmu.edu/datasets/boston"
           raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
           data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
           target = raw_df.values[1::2, 2]

       Alternative datasets include the California housing dataset [3]_
       (i.e. :func:`~sklearn.datasets.fetch_california_housing`) and Ames
       housing dataset [4]_. You can load the datasets as follows::

           from sklearn.datasets import fetch_california_housing
           housing = fetch_california_housing()

       for the California housing dataset and::

           from sklearn.datasets import fetch_openml
           housing = fetch_openml(name="house_prices", as_frame=True)  # noqa

       for the Ames housing dataset.

    Parameters
    ----------
    return_X_y : bool, default=False
        If True, returns ``(data, target)`` instead of a Bunch object.
        See below for more information about the `data` and `target` object.

        .. versionadded:: 0.18

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        data : ndarray of shape (506, 13)
            The data matrix.
        target : ndarray of shape (506,)
            The regression target.
        filename : str
            The physical location of boston csv dataset.

            .. versionadded:: 0.20

        DESCR : str
            The full description of the dataset.
        feature_names : ndarray
            The names of features

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.18

    Notes
    -----
    .. versionchanged:: 0.20
        Fixed a wrong data point at [445, 0].

    References
    ----------
    .. [1] `Racist data destruction? M Carlisle,
           <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>`_
    .. [2] `Harrison Jr, David, and Daniel L. Rubinfeld.
           "Hedonic housing prices and the demand for clean air."
           Journal of environmental economics and management 5.1 (1978): 81-102.
           <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>`_
    .. [3] `California housing dataset
           <https://scikit-learn.org/stable/datasets/real_world.html#california-housing-dataset>`_
    .. [4] `Ames housing dataset
           <https://www.openml.org/d/42165>`_

    Examples
    --------
    >>> import warnings
    >>> from sklearn.datasets import load_boston
    >>> with warnings.catch_warnings():
    ...     # You should probably not use this dataset.
    ...     warnings.filterwarnings("ignore")
    ...     X, y = load_boston(return_X_y=True)
    >>> print(X.shape)
    (506, 13)
    """
    # TODO: once the deprecation period is over, implement a module level
    # `__getattr__` function in `sklearn.datasets` to raise an exception with
    # an informative error message at import time instead of just removing
    # load_boston. The goal is to avoid having beginners that copy-paste code
    # from numerous books and tutorials that use this dataset loader get
    # a confusing ImportError when trying to learn scikit-learn.
    # See: https://www.python.org/dev/peps/pep-0562/

    descr_text = load_descr("boston_house_prices.rst")

    data_file_name = "boston_house_prices.csv"
    with resources.open_text(DATA_MODULE, data_file_name) as f:
        data_file = csv.reader(f)
        # First CSV row stores the sample/feature counts, second the headers.
        temp = next(data_file)
        n_samples = int(temp[0])
        n_features = int(temp[1])
        data = np.empty((n_samples, n_features))
        target = np.empty((n_samples,))
        temp = next(data_file)  # names of features
        feature_names = np.array(temp)

        # Each remaining row is one sample: features first, target last.
        for i, d in enumerate(data_file):
            data[i] = np.asarray(d[:-1], dtype=np.float64)
            target[i] = np.asarray(d[-1], dtype=np.float64)

    if return_X_y:
        return data, target

    return Bunch(
        data=data,
        target=target,
        # last column is target value
        feature_names=feature_names[:-1],
        DESCR=descr_text,
        filename=data_file_name,
        data_module=DATA_MODULE,
    )
def load_sample_images():
    """Load sample images for image manipulation.

    Loads both, ``china`` and ``flower``.

    Read more in the :ref:`User Guide <sample_images>`.

    Returns
    -------
    data : :class:`~sklearn.utils.Bunch`
        Dictionary-like object, with the following attributes.

        images : list of ndarray of shape (427, 640, 3)
            The two sample images.
        filenames : list
            The filenames for the images.
        DESCR : str
            The full description of the dataset.

    Examples
    --------
    To load the data and visualize the images:

    >>> from sklearn.datasets import load_sample_images
    >>> dataset = load_sample_images()     #doctest: +SKIP
    >>> len(dataset.images)                #doctest: +SKIP
    2
    >>> first_img_data = dataset.images[0] #doctest: +SKIP
    >>> first_img_data.shape               #doctest: +SKIP
    (427, 640, 3)
    >>> first_img_data.dtype               #doctest: +SKIP
    dtype('uint8')
    """
    # Import PIL lazily so it stays an optional dependency.
    from ..externals._pilutil import imread

    descr = load_descr("README.txt", descr_module=IMAGES_MODULE)

    # Collect the packaged JPEG names first, then decode each one.
    filenames = [
        name
        for name in sorted(resources.contents(IMAGES_MODULE))
        if name.endswith(".jpg")
    ]
    images = []
    for name in filenames:
        with resources.open_binary(IMAGES_MODULE, name) as image_file:
            images.append(imread(image_file))

    return Bunch(images=images, filenames=filenames, DESCR=descr)
def load_sample_image(image_name):
    """Load the numpy array of a single sample image.

    Read more in the :ref:`User Guide <sample_images>`.

    Parameters
    ----------
    image_name : {`china.jpg`, `flower.jpg`}
        The name of the sample image loaded.

    Returns
    -------
    img : 3D array
        The image as a numpy array: height x width x color.

    Examples
    --------

    >>> from sklearn.datasets import load_sample_image
    >>> china = load_sample_image('china.jpg')   # doctest: +SKIP
    >>> china.dtype                              # doctest: +SKIP
    dtype('uint8')
    >>> china.shape                              # doctest: +SKIP
    (427, 640, 3)
    >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP
    >>> flower.dtype                             # doctest: +SKIP
    dtype('uint8')
    >>> flower.shape                             # doctest: +SKIP
    (427, 640, 3)
    """
    dataset = load_sample_images()
    # Return the first packaged image whose filename ends with the request.
    for position, candidate in enumerate(dataset.filenames):
        if candidate.endswith(image_name):
            return dataset.images[position]
    raise AttributeError("Cannot find sample image: %s" % image_name)
1364 1365 Parameters 1366 ---------- 1367 image_name : {`china.jpg`, `flower.jpg`} 1368 The name of the sample image loaded 1369 1370 Returns 1371 ------- 1372 img : 3D array 1373 The image as a numpy array: height x width x color 1374 1375 Examples 1376 -------- 1377 1378 >>> from sklearn.datasets import load_sample_image 1379 >>> china = load_sample_image('china.jpg') # doctest: +SKIP 1380 >>> china.dtype # doctest: +SKIP 1381 dtype('uint8') 1382 >>> china.shape # doctest: +SKIP 1383 (427, 640, 3) 1384 >>> flower = load_sample_image('flower.jpg') # doctest: +SKIP 1385 >>> flower.dtype # doctest: +SKIP 1386 dtype('uint8') 1387 >>> flower.shape # doctest: +SKIP 1388 (427, 640, 3) 1389 """ 1390 images = load_sample_images() 1391 index = None 1392 for i, filename in enumerate(images.filenames): 1393 if filename.endswith(image_name): 1394 index = i 1395 break 1396 if index is None: 1397 raise AttributeError("Cannot find sample image: %s" % image_name) 1398 return images.images[index] 1399 1400 1401def _pkl_filepath(*args, **kwargs): 1402 """Return filename for Python 3 pickles 1403 1404 args[-1] is expected to be the ".pkl" filename. For compatibility with 1405 older scikit-learn versions, a suffix is inserted before the extension. 
1406 1407 _pkl_filepath('/path/to/folder', 'filename.pkl') returns 1408 '/path/to/folder/filename_py3.pkl' 1409 1410 """ 1411 py3_suffix = kwargs.get("py3_suffix", "_py3") 1412 basename, ext = splitext(args[-1]) 1413 basename += py3_suffix 1414 new_args = args[:-1] + (basename + ext,) 1415 return join(*new_args) 1416 1417 1418def _sha256(path): 1419 """Calculate the sha256 hash of the file at path.""" 1420 sha256hash = hashlib.sha256() 1421 chunk_size = 8192 1422 with open(path, "rb") as f: 1423 while True: 1424 buffer = f.read(chunk_size) 1425 if not buffer: 1426 break 1427 sha256hash.update(buffer) 1428 return sha256hash.hexdigest() 1429 1430 1431def _fetch_remote(remote, dirname=None): 1432 """Helper function to download a remote dataset into path 1433 1434 Fetch a dataset pointed by remote's url, save into path using remote's 1435 filename and ensure its integrity based on the SHA256 Checksum of the 1436 downloaded file. 1437 1438 Parameters 1439 ---------- 1440 remote : RemoteFileMetadata 1441 Named tuple containing remote dataset meta information: url, filename 1442 and checksum 1443 1444 dirname : str 1445 Directory to save the file to. 1446 1447 Returns 1448 ------- 1449 file_path: str 1450 Full path of the created file. 1451 """ 1452 1453 file_path = remote.filename if dirname is None else join(dirname, remote.filename) 1454 urlretrieve(remote.url, file_path) 1455 checksum = _sha256(file_path) 1456 if remote.checksum != checksum: 1457 raise IOError( 1458 "{} has an SHA256 checksum ({}) " 1459 "differing from expected ({}), " 1460 "file may be corrupted.".format(file_path, checksum, remote.checksum) 1461 ) 1462 return file_path 1463