1# coding: utf-8 2# pylint: disable=too-many-arguments, too-many-branches, invalid-name 3# pylint: disable=too-many-lines, too-many-locals, no-self-use 4"""Core XGBoost Library.""" 5import collections 6# pylint: disable=no-name-in-module,import-error 7from collections.abc import Mapping 8from typing import List, Optional, Any, Union, Dict, TypeVar 9# pylint: enable=no-name-in-module,import-error 10from typing import Callable, Tuple, cast 11import ctypes 12import os 13import re 14import sys 15import json 16import warnings 17from functools import wraps 18from inspect import signature, Parameter 19 20import numpy as np 21import scipy.sparse 22 23from .compat import (STRING_TYPES, DataFrame, py_str, PANDAS_INSTALLED, 24 lazy_isinstance) 25from .libpath import find_lib_path 26 27# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h 28c_bst_ulong = ctypes.c_uint64 29 30 31class XGBoostError(ValueError): 32 """Error thrown by xgboost trainer.""" 33 34 35class EarlyStopException(Exception): 36 """Exception to signal early stopping. 37 38 Parameters 39 ---------- 40 best_iteration : int 41 The best iteration stopped. 42 """ 43 44 def __init__(self, best_iteration): 45 super().__init__() 46 self.best_iteration = best_iteration 47 48 49# Callback environment used by callbacks 50CallbackEnv = collections.namedtuple( 51 "XGBoostCallbackEnv", 52 ["model", 53 "cvfolds", 54 "iteration", 55 "begin_iteration", 56 "end_iteration", 57 "rank", 58 "evaluation_result_list"]) 59 60 61def from_pystr_to_cstr(data: Union[str, List[str]]): 62 """Convert a Python str or list of Python str to C pointer 63 64 Parameters 65 ---------- 66 data 67 str or list of str 68 """ 69 70 if isinstance(data, str): 71 return bytes(data, "utf-8") 72 if isinstance(data, list): 73 pointers = (ctypes.c_char_p * len(data))() 74 data = [bytes(d, 'utf-8') for d in data] 75 pointers[:] = data 76 return pointers 77 raise TypeError() 78 79 80def from_cstr_to_pystr(data, length) -> List[str]: 81 """Revert C pointer to Python str 82 83 Parameters 84 ---------- 85 data : ctypes pointer 86 pointer to data 87 length : ctypes pointer 88 pointer to length of data 89 """ 90 res = [] 91 for i in range(length.value): 92 try: 93 res.append(str(data[i].decode('ascii'))) 94 except UnicodeDecodeError: 95 res.append(str(data[i].decode('utf-8'))) 96 return res 97 98 99def _convert_ntree_limit( 100 booster: "Booster", 101 ntree_limit: Optional[int], 102 iteration_range: Optional[Tuple[int, int]] 103) -> Optional[Tuple[int, int]]: 104 if ntree_limit is not None and ntree_limit != 0: 105 warnings.warn( 106 "ntree_limit is deprecated, use `iteration_range` or model " 107 "slicing instead.", 108 UserWarning 109 ) 110 if iteration_range is not None and iteration_range[1] != 0: 111 raise ValueError( 112 "Only one of `iteration_range` and `ntree_limit` can be non zero." 113 ) 114 num_parallel_tree, _ = _get_booster_layer_trees(booster) 115 num_parallel_tree = max([num_parallel_tree, 1]) 116 iteration_range = (0, ntree_limit // num_parallel_tree) 117 return iteration_range 118 119 120def _expect(expectations, got): 121 """Translate input error into string. 122 123 Parameters 124 ---------- 125 expectations: sequence 126 a list of expected value. 127 got: 128 actual input 129 130 Returns 131 ------- 132 msg: str 133 """ 134 msg = 'Expecting ' 135 for t in range(len(expectations) - 1): 136 msg += str(expectations[t]) 137 msg += ' or ' 138 msg += str(expectations[-1]) 139 msg += '. Got ' + str(got) 140 return msg 141 142 143def _log_callback(msg: bytes) -> None: 144 """Redirect logs from native library into Python console""" 145 print(py_str(msg)) 146 147 148def _get_log_callback_func(): 149 """Wrap log_callback() method in ctypes callback type""" 150 # pylint: disable=invalid-name 151 CALLBACK = ctypes.CFUNCTYPE(None, ctypes.c_char_p) 152 return CALLBACK(_log_callback) 153 154 155def _load_lib(): 156 """Load xgboost Library.""" 157 lib_paths = find_lib_path() 158 if not lib_paths: 159 return None 160 try: 161 pathBackup = os.environ['PATH'].split(os.pathsep) 162 except KeyError: 163 pathBackup = [] 164 lib_success = False 165 os_error_list = [] 166 for lib_path in lib_paths: 167 try: 168 # needed when the lib is linked with non-system-available 169 # dependencies 170 os.environ['PATH'] = os.pathsep.join( 171 pathBackup + [os.path.dirname(lib_path)]) 172 lib = ctypes.cdll.LoadLibrary(lib_path) 173 lib_success = True 174 except OSError as e: 175 os_error_list.append(str(e)) 176 continue 177 finally: 178 os.environ['PATH'] = os.pathsep.join(pathBackup) 179 if not lib_success: 180 libname = os.path.basename(lib_paths[0]) 181 raise XGBoostError( 182 f""" 183XGBoost Library ({libname}) could not be loaded. 184Likely causes: 185 * OpenMP runtime is not installed 186 - vcomp140.dll or libgomp-1.dll for Windows 187 - libomp.dylib for Mac OSX 188 - libgomp.so for Linux and other UNIX-like OSes 189 Mac OSX users: Run `brew install libomp` to install OpenMP runtime. 190 191 * You are running 32-bit Python on a 64-bit OS 192 193Error message(s): {os_error_list} 194""") 195 lib.XGBGetLastError.restype = ctypes.c_char_p 196 lib.callback = _get_log_callback_func() 197 if lib.XGBRegisterLogCallback(lib.callback) != 0: 198 raise XGBoostError(lib.XGBGetLastError()) 199 return lib 200 201 202# load the XGBoost library globally 203_LIB = _load_lib() 204 205 206def _check_call(ret): 207 """Check the return value of C API call 208 209 This function will raise exception when error occurs. 210 Wrap every API call with this function 211 212 Parameters 213 ---------- 214 ret : int 215 return value from API calls 216 """ 217 if ret != 0: 218 raise XGBoostError(py_str(_LIB.XGBGetLastError())) 219 220 221def _numpy2ctypes_type(dtype): 222 _NUMPY_TO_CTYPES_MAPPING = { 223 np.float32: ctypes.c_float, 224 np.float64: ctypes.c_double, 225 np.uint32: ctypes.c_uint, 226 np.uint64: ctypes.c_uint64, 227 np.int32: ctypes.c_int32, 228 np.int64: ctypes.c_int64, 229 } 230 if np.intc is not np.int32: # Windows 231 _NUMPY_TO_CTYPES_MAPPING[np.intc] = _NUMPY_TO_CTYPES_MAPPING[np.int32] 232 if dtype not in _NUMPY_TO_CTYPES_MAPPING.keys(): 233 raise TypeError( 234 f"Supported types: {_NUMPY_TO_CTYPES_MAPPING.keys()}, got: {dtype}" 235 ) 236 return _NUMPY_TO_CTYPES_MAPPING[dtype] 237 238 239def _cuda_array_interface(data) -> bytes: 240 assert ( 241 data.dtype.hasobject is False 242 ), "Input data contains `object` dtype. Expecting numeric data." 243 interface = data.__cuda_array_interface__ 244 if "mask" in interface: 245 interface["mask"] = interface["mask"].__cuda_array_interface__ 246 interface_str = bytes(json.dumps(interface), "utf-8") 247 return interface_str 248 249 250def ctypes2numpy(cptr, length, dtype): 251 """Convert a ctypes pointer array to a numpy array.""" 252 ctype = _numpy2ctypes_type(dtype) 253 if not isinstance(cptr, ctypes.POINTER(ctype)): 254 raise RuntimeError(f"expected {ctype} pointer") 255 res = np.zeros(length, dtype=dtype) 256 if not ctypes.memmove(res.ctypes.data, cptr, length * res.strides[0]): 257 raise RuntimeError("memmove failed") 258 return res 259 260 261def ctypes2cupy(cptr, length, dtype): 262 """Convert a ctypes pointer array to a cupy array.""" 263 # pylint: disable=import-error 264 import cupy 265 from cupy.cuda.memory import MemoryPointer 266 from cupy.cuda.memory import UnownedMemory 267 268 CUPY_TO_CTYPES_MAPPING = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint} 269 if dtype not in CUPY_TO_CTYPES_MAPPING.keys(): 270 raise RuntimeError(f"Supported types: {CUPY_TO_CTYPES_MAPPING.keys()}") 271 addr = ctypes.cast(cptr, ctypes.c_void_p).value 272 # pylint: disable=c-extension-no-member,no-member 273 device = cupy.cuda.runtime.pointerGetAttributes(addr).device 274 # The owner field is just used to keep the memory alive with ref count. As 275 # unowned's life time is scoped within this function we don't need that. 276 unownd = UnownedMemory( 277 addr, length * ctypes.sizeof(CUPY_TO_CTYPES_MAPPING[dtype]), owner=None 278 ) 279 memptr = MemoryPointer(unownd, 0) 280 # pylint: disable=unexpected-keyword-arg 281 mem = cupy.ndarray((length,), dtype=dtype, memptr=memptr) 282 assert mem.device.id == device 283 arr = cupy.array(mem, copy=True) 284 return arr 285 286 287def ctypes2buffer(cptr, length): 288 """Convert ctypes pointer to buffer type.""" 289 if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)): 290 raise RuntimeError('expected char pointer') 291 res = bytearray(length) 292 rptr = (ctypes.c_char * length).from_buffer(res) 293 if not ctypes.memmove(rptr, cptr, length): 294 raise RuntimeError('memmove failed') 295 return res 296 297 298def c_str(string): 299 """Convert a python string to cstring.""" 300 return ctypes.c_char_p(string.encode('utf-8')) 301 302 303def c_array(ctype, values): 304 """Convert a python string to c array.""" 305 if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype): 306 return (ctype * len(values)).from_buffer_copy(values) 307 return (ctype * len(values))(*values) 308 309 310def _prediction_output(shape, dims, predts, is_cuda): 311 arr_shape: np.ndarray = ctypes2numpy(shape, dims.value, np.uint64) 312 length = int(np.prod(arr_shape)) 313 if is_cuda: 314 arr_predict = ctypes2cupy(predts, length, np.float32) 315 else: 316 arr_predict: np.ndarray = ctypes2numpy(predts, length, np.float32) 317 arr_predict = arr_predict.reshape(arr_shape) 318 return arr_predict 319 320 321class DataIter: # pylint: disable=too-many-instance-attributes 322 """The interface for user defined data iterator. 323 324 Parameters 325 ---------- 326 cache_prefix: 327 Prefix to the cache files, only used in external memory. It can be either an URI 328 or a file path. 329 330 """ 331 _T = TypeVar("_T") 332 333 def __init__(self, cache_prefix: Optional[str] = None) -> None: 334 self.cache_prefix = cache_prefix 335 336 self._handle = _ProxyDMatrix() 337 self._exception: Optional[Exception] = None 338 self._enable_categorical = False 339 self._allow_host = True 340 # Stage data in Python until reset or next is called to avoid data being free. 341 self._temporary_data = None 342 343 def _get_callbacks( 344 self, allow_host: bool, enable_categorical: bool 345 ) -> Tuple[Callable, Callable]: 346 assert hasattr(self, "cache_prefix"), "__init__ is not called." 347 self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)( 348 self._reset_wrapper 349 ) 350 self._next_callback = ctypes.CFUNCTYPE( 351 ctypes.c_int, 352 ctypes.c_void_p, 353 )(self._next_wrapper) 354 self._allow_host = allow_host 355 self._enable_categorical = enable_categorical 356 return self._reset_callback, self._next_callback 357 358 @property 359 def proxy(self) -> "_ProxyDMatrix": 360 """Handle of DMatrix proxy.""" 361 return self._handle 362 363 def _handle_exception(self, fn: Callable, dft_ret: _T) -> _T: 364 if self._exception is not None: 365 return dft_ret 366 367 try: 368 return fn() 369 except Exception as e: # pylint: disable=broad-except 370 # Defer the exception in order to return 0 and stop the iteration. 371 # Exception inside a ctype callback function has no effect except 372 # for printing to stderr (doesn't stop the execution). 373 tb = sys.exc_info()[2] 374 # On dask, the worker is restarted and somehow the information is 375 # lost. 376 self._exception = e.with_traceback(tb) 377 return dft_ret 378 379 def _reraise(self) -> None: 380 self._temporary_data = None 381 if self._exception is not None: 382 # pylint 2.7.0 believes `self._exception` can be None even with `assert 383 # isinstace` 384 exc = self._exception 385 self._exception = None 386 raise exc # pylint: disable=raising-bad-type 387 388 def __del__(self) -> None: 389 assert self._temporary_data is None 390 assert self._exception is None 391 392 def _reset_wrapper(self, this: None) -> None: # pylint: disable=unused-argument 393 """A wrapper for user defined `reset` function.""" 394 # free the data 395 self._temporary_data = None 396 self._handle_exception(self.reset, None) 397 398 def _next_wrapper(self, this: None) -> int: # pylint: disable=unused-argument 399 """A wrapper for user defined `next` function. 400 401 `this` is not used in Python. ctypes can handle `self` of a Python 402 member function automatically when converting it to c function 403 pointer. 404 405 """ 406 @_deprecate_positional_args 407 def data_handle( 408 data: Any, 409 *, 410 feature_names: Optional[List[str]] = None, 411 feature_types: Optional[List[str]] = None, 412 **kwargs: Any, 413 ) -> None: 414 from .data import dispatch_proxy_set_data 415 from .data import _proxy_transform 416 417 new, cat_codes, feature_names, feature_types = _proxy_transform( 418 data, 419 feature_names, 420 feature_types, 421 self._enable_categorical, 422 ) 423 # Stage the data, meta info are copied inside C++ MetaInfo. 424 self._temporary_data = (new, cat_codes) 425 dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host) 426 self.proxy.set_info( 427 feature_names=feature_names, 428 feature_types=feature_types, 429 **kwargs, 430 ) 431 # pylint: disable=not-callable 432 return self._handle_exception(lambda: self.next(data_handle), 0) 433 434 def reset(self) -> None: 435 """Reset the data iterator. Prototype for user defined function.""" 436 raise NotImplementedError() 437 438 def next(self, input_data: Callable) -> int: 439 """Set the next batch of data. 440 441 Parameters 442 ---------- 443 444 data_handle: 445 A function with same data fields like `data`, `label` with 446 `xgboost.DMatrix`. 447 448 Returns 449 ------- 450 0 if there's no more batch, otherwise 1. 451 452 """ 453 raise NotImplementedError() 454 455 456# Notice for `_deprecate_positional_args` 457# Authors: Olivier Grisel 458# Gael Varoquaux 459# Andreas Mueller 460# Lars Buitinck 461# Alexandre Gramfort 462# Nicolas Tresegnie 463# Sylvain Marie 464# License: BSD 3 clause 465def _deprecate_positional_args(f): 466 """Decorator for methods that issues warnings for positional arguments 467 468 Using the keyword-only argument syntax in pep 3102, arguments after the 469 * will issue a warning when passed as a positional argument. 470 471 Modifed from sklearn utils.validation. 472 473 Parameters 474 ---------- 475 f : function 476 function to check arguments on 477 """ 478 sig = signature(f) 479 kwonly_args = [] 480 all_args = [] 481 482 for name, param in sig.parameters.items(): 483 if param.kind == Parameter.POSITIONAL_OR_KEYWORD: 484 all_args.append(name) 485 elif param.kind == Parameter.KEYWORD_ONLY: 486 kwonly_args.append(name) 487 488 @wraps(f) 489 def inner_f(*args, **kwargs): 490 extra_args = len(args) - len(all_args) 491 if extra_args > 0: 492 # ignore first 'self' argument for instance methods 493 args_msg = [ 494 f"{name}" for name, _ in zip( 495 kwonly_args[:extra_args], args[-extra_args:] 496 ) 497 ] 498 # pylint: disable=consider-using-f-string 499 warnings.warn( 500 "Pass `{}` as keyword args. Passing these as positional " 501 "arguments will be considered as error in future releases.". 502 format(", ".join(args_msg)), FutureWarning 503 ) 504 for k, arg in zip(sig.parameters, args): 505 kwargs[k] = arg 506 return f(**kwargs) 507 508 return inner_f 509 510 511class DMatrix: # pylint: disable=too-many-instance-attributes 512 """Data Matrix used in XGBoost. 513 514 DMatrix is an internal data structure that is used by XGBoost, 515 which is optimized for both memory efficiency and training speed. 516 You can construct DMatrix from multiple different sources of data. 517 """ 518 519 @_deprecate_positional_args 520 def __init__( 521 self, 522 data, 523 label=None, 524 *, 525 weight=None, 526 base_margin=None, 527 missing: Optional[float] = None, 528 silent=False, 529 feature_names: Optional[List[str]] = None, 530 feature_types: Optional[List[str]] = None, 531 nthread: Optional[int] = None, 532 group=None, 533 qid=None, 534 label_lower_bound=None, 535 label_upper_bound=None, 536 feature_weights=None, 537 enable_categorical: bool = False, 538 ) -> None: 539 """Parameters 540 ---------- 541 data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/ 542 dt.Frame/cudf.DataFrame/cupy.array/dlpack 543 Data source of DMatrix. 544 When data is string or os.PathLike type, it represents the path 545 libsvm format txt file, csv file (by specifying uri parameter 546 'path_to_csv?format=csv'), or binary file that xgboost can read 547 from. 548 label : array_like 549 Label of the training data. 550 weight : array_like 551 Weight for each instance. 552 553 .. note:: For ranking task, weights are per-group. 554 555 In ranking task, one weight is assigned to each group (not each 556 data point). This is because we only care about the relative 557 ordering of data points within each group, so it doesn't make 558 sense to assign weights to individual data points. 559 560 base_margin: array_like 561 Base margin used for boosting from existing model. 562 missing : float, optional 563 Value in the input data which needs to be present as a missing 564 value. If None, defaults to np.nan. 565 silent : boolean, optional 566 Whether print messages during construction 567 feature_names : list, optional 568 Set names for features. 569 feature_types : 570 571 Set types for features. When `enable_categorical` is set to `True`, string 572 "c" represents categorical data type. 573 574 nthread : integer, optional 575 Number of threads to use for loading data when parallelization is 576 applicable. If -1, uses maximum threads available on the system. 577 group : array_like 578 Group size for all ranking group. 579 qid : array_like 580 Query ID for data samples, used for ranking. 581 label_lower_bound : array_like 582 Lower bound for survival training. 583 label_upper_bound : array_like 584 Upper bound for survival training. 585 feature_weights : array_like, optional 586 Set feature weights for column sampling. 587 enable_categorical: boolean, optional 588 589 .. versionadded:: 1.3.0 590 591 Experimental support of specializing for categorical features. Do not set to 592 True unless you are interested in development. Currently it's only available 593 for `gpu_hist` tree method with 1 vs rest (one hot) categorical split. Also, 594 JSON serialization format is required. 595 596 """ 597 if group is not None and qid is not None: 598 raise ValueError("Either one of `group` or `qid` should be None.") 599 600 self.missing = missing if missing is not None else np.nan 601 self.nthread = nthread if nthread is not None else -1 602 self.silent = silent 603 604 # force into void_p, mac need to pass things in as void_p 605 if data is None: 606 self.handle = None 607 return 608 609 from .data import dispatch_data_backend, _is_iter 610 611 if _is_iter(data): 612 self._init_from_iter(data, enable_categorical) 613 assert self.handle is not None 614 return 615 616 handle, feature_names, feature_types = dispatch_data_backend( 617 data, 618 missing=self.missing, 619 threads=self.nthread, 620 feature_names=feature_names, 621 feature_types=feature_types, 622 enable_categorical=enable_categorical, 623 ) 624 assert handle is not None 625 self.handle = handle 626 627 self.set_info( 628 label=label, 629 weight=weight, 630 base_margin=base_margin, 631 group=group, 632 qid=qid, 633 label_lower_bound=label_lower_bound, 634 label_upper_bound=label_upper_bound, 635 feature_weights=feature_weights, 636 ) 637 638 if feature_names is not None: 639 self.feature_names = feature_names 640 if feature_types is not None: 641 self.feature_types = feature_types 642 643 def _init_from_iter(self, iterator: DataIter, enable_categorical: bool): 644 it = iterator 645 args = { 646 "missing": self.missing, 647 "nthread": self.nthread, 648 "cache_prefix": it.cache_prefix if it.cache_prefix else "", 649 } 650 args = from_pystr_to_cstr(json.dumps(args)) 651 handle = ctypes.c_void_p() 652 # pylint: disable=protected-access 653 reset_callback, next_callback = it._get_callbacks( 654 True, enable_categorical 655 ) 656 ret = _LIB.XGDMatrixCreateFromCallback( 657 None, 658 it.proxy.handle, 659 reset_callback, 660 next_callback, 661 args, 662 ctypes.byref(handle), 663 ) 664 # pylint: disable=protected-access 665 it._reraise() 666 # delay check_call to throw intermediate exception first 667 _check_call(ret) 668 self.handle = handle 669 670 def __del__(self): 671 if hasattr(self, "handle") and self.handle: 672 _check_call(_LIB.XGDMatrixFree(self.handle)) 673 self.handle = None 674 675 @_deprecate_positional_args 676 def set_info( 677 self, 678 *, 679 label=None, 680 weight=None, 681 base_margin=None, 682 group=None, 683 qid=None, 684 label_lower_bound=None, 685 label_upper_bound=None, 686 feature_names: Optional[List[str]] = None, 687 feature_types: Optional[List[str]] = None, 688 feature_weights=None 689 ) -> None: 690 """Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`.""" 691 from .data import dispatch_meta_backend 692 693 if label is not None: 694 self.set_label(label) 695 if weight is not None: 696 self.set_weight(weight) 697 if base_margin is not None: 698 self.set_base_margin(base_margin) 699 if group is not None: 700 self.set_group(group) 701 if qid is not None: 702 self.set_uint_info('qid', qid) 703 if label_lower_bound is not None: 704 self.set_float_info('label_lower_bound', label_lower_bound) 705 if label_upper_bound is not None: 706 self.set_float_info('label_upper_bound', label_upper_bound) 707 if feature_names is not None: 708 self.feature_names = feature_names 709 if feature_types is not None: 710 self.feature_types = feature_types 711 if feature_weights is not None: 712 dispatch_meta_backend(matrix=self, data=feature_weights, 713 name='feature_weights') 714 715 def get_float_info(self, field): 716 """Get float property from the DMatrix. 717 718 Parameters 719 ---------- 720 field: str 721 The field name of the information 722 723 Returns 724 ------- 725 info : array 726 a numpy array of float information of the data 727 """ 728 length = c_bst_ulong() 729 ret = ctypes.POINTER(ctypes.c_float)() 730 _check_call(_LIB.XGDMatrixGetFloatInfo(self.handle, 731 c_str(field), 732 ctypes.byref(length), 733 ctypes.byref(ret))) 734 return ctypes2numpy(ret, length.value, np.float32) 735 736 def get_uint_info(self, field): 737 """Get unsigned integer property from the DMatrix. 738 739 Parameters 740 ---------- 741 field: str 742 The field name of the information 743 744 Returns 745 ------- 746 info : array 747 a numpy array of unsigned integer information of the data 748 """ 749 length = c_bst_ulong() 750 ret = ctypes.POINTER(ctypes.c_uint)() 751 _check_call(_LIB.XGDMatrixGetUIntInfo(self.handle, 752 c_str(field), 753 ctypes.byref(length), 754 ctypes.byref(ret))) 755 return ctypes2numpy(ret, length.value, np.uint32) 756 757 def set_float_info(self, field, data): 758 """Set float type property into the DMatrix. 759 760 Parameters 761 ---------- 762 field: str 763 The field name of the information 764 765 data: numpy array 766 The array of data to be set 767 """ 768 from .data import dispatch_meta_backend 769 dispatch_meta_backend(self, data, field, 'float') 770 771 def set_float_info_npy2d(self, field, data): 772 """Set float type property into the DMatrix 773 for numpy 2d array input 774 775 Parameters 776 ---------- 777 field: str 778 The field name of the information 779 780 data: numpy array 781 The array of data to be set 782 """ 783 from .data import dispatch_meta_backend 784 dispatch_meta_backend(self, data, field, 'float') 785 786 def set_uint_info(self, field, data): 787 """Set uint type property into the DMatrix. 788 789 Parameters 790 ---------- 791 field: str 792 The field name of the information 793 794 data: numpy array 795 The array of data to be set 796 """ 797 from .data import dispatch_meta_backend 798 dispatch_meta_backend(self, data, field, 'uint32') 799 800 def save_binary(self, fname, silent=True): 801 """Save DMatrix to an XGBoost buffer. Saved binary can be later loaded 802 by providing the path to :py:func:`xgboost.DMatrix` as input. 803 804 Parameters 805 ---------- 806 fname : string or os.PathLike 807 Name of the output buffer file. 808 silent : bool (optional; default: True) 809 If set, the output is suppressed. 810 """ 811 fname = os.fspath(os.path.expanduser(fname)) 812 _check_call(_LIB.XGDMatrixSaveBinary(self.handle, 813 c_str(fname), 814 ctypes.c_int(silent))) 815 816 def set_label(self, label): 817 """Set label of dmatrix 818 819 Parameters 820 ---------- 821 label: array like 822 The label information to be set into DMatrix 823 """ 824 from .data import dispatch_meta_backend 825 dispatch_meta_backend(self, label, 'label', 'float') 826 827 def set_weight(self, weight): 828 """Set weight of each instance. 829 830 Parameters 831 ---------- 832 weight : array like 833 Weight for each data point 834 835 .. note:: For ranking task, weights are per-group. 836 837 In ranking task, one weight is assigned to each group (not each 838 data point). This is because we only care about the relative 839 ordering of data points within each group, so it doesn't make 840 sense to assign weights to individual data points. 841 842 """ 843 from .data import dispatch_meta_backend 844 dispatch_meta_backend(self, weight, 'weight', 'float') 845 846 def set_base_margin(self, margin): 847 """Set base margin of booster to start from. 848 849 This can be used to specify a prediction value of existing model to be 850 base_margin However, remember margin is needed, instead of transformed 851 prediction e.g. for logistic regression: need to put in value before 852 logistic transformation see also example/demo.py 853 854 Parameters 855 ---------- 856 margin: array like 857 Prediction margin of each datapoint 858 859 """ 860 from .data import dispatch_meta_backend 861 dispatch_meta_backend(self, margin, 'base_margin', 'float') 862 863 def set_group(self, group): 864 """Set group size of DMatrix (used for ranking). 865 866 Parameters 867 ---------- 868 group : array like 869 Group size of each group 870 """ 871 from .data import dispatch_meta_backend 872 dispatch_meta_backend(self, group, 'group', 'uint32') 873 874 def get_label(self): 875 """Get the label of the DMatrix. 876 877 Returns 878 ------- 879 label : array 880 """ 881 return self.get_float_info('label') 882 883 def get_weight(self): 884 """Get the weight of the DMatrix. 885 886 Returns 887 ------- 888 weight : array 889 """ 890 return self.get_float_info('weight') 891 892 def get_base_margin(self): 893 """Get the base margin of the DMatrix. 894 895 Returns 896 ------- 897 base_margin : float 898 """ 899 return self.get_float_info('base_margin') 900 901 def num_row(self): 902 """Get the number of rows in the DMatrix. 903 904 Returns 905 ------- 906 number of rows : int 907 """ 908 ret = c_bst_ulong() 909 _check_call(_LIB.XGDMatrixNumRow(self.handle, 910 ctypes.byref(ret))) 911 return ret.value 912 913 def num_col(self): 914 """Get the number of columns (features) in the DMatrix. 915 916 Returns 917 ------- 918 number of columns : int 919 """ 920 ret = c_bst_ulong() 921 _check_call(_LIB.XGDMatrixNumCol(self.handle, ctypes.byref(ret))) 922 return ret.value 923 924 def slice( 925 self, rindex: Union[List[int], np.ndarray], allow_groups: bool = False 926 ) -> "DMatrix": 927 """Slice the DMatrix and return a new DMatrix that only contains `rindex`. 928 929 Parameters 930 ---------- 931 rindex 932 List of indices to be selected. 933 allow_groups 934 Allow slicing of a matrix with a groups attribute 935 936 Returns 937 ------- 938 res 939 A new DMatrix containing only selected indices. 940 """ 941 from .data import _maybe_np_slice 942 943 res = DMatrix(None) 944 res.handle = ctypes.c_void_p() 945 rindex = _maybe_np_slice(rindex, dtype=np.int32) 946 _check_call( 947 _LIB.XGDMatrixSliceDMatrixEx( 948 self.handle, 949 c_array(ctypes.c_int, rindex), 950 c_bst_ulong(len(rindex)), 951 ctypes.byref(res.handle), 952 ctypes.c_int(1 if allow_groups else 0), 953 ) 954 ) 955 return res 956 957 @property 958 def feature_names(self) -> Optional[List[str]]: 959 """Get feature names (column labels). 960 961 Returns 962 ------- 963 feature_names : list or None 964 """ 965 length = c_bst_ulong() 966 sarr = ctypes.POINTER(ctypes.c_char_p)() 967 _check_call( 968 _LIB.XGDMatrixGetStrFeatureInfo( 969 self.handle, 970 c_str("feature_name"), 971 ctypes.byref(length), 972 ctypes.byref(sarr), 973 ) 974 ) 975 feature_names = from_cstr_to_pystr(sarr, length) 976 if not feature_names: 977 return None 978 return feature_names 979 980 @feature_names.setter 981 def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None: 982 """Set feature names (column labels). 983 984 Parameters 985 ---------- 986 feature_names : list or None 987 Labels for features. None will reset existing feature names 988 """ 989 if feature_names is not None: 990 # validate feature name 991 try: 992 if not isinstance(feature_names, str): 993 feature_names = list(feature_names) 994 else: 995 feature_names = [feature_names] 996 except TypeError: 997 feature_names = [feature_names] 998 999 if len(feature_names) != len(set(feature_names)): 1000 raise ValueError('feature_names must be unique') 1001 if len(feature_names) != self.num_col() and self.num_col() != 0: 1002 msg = ("feature_names must have the same length as data, ", 1003 f"expected {self.num_col()}, got {len(feature_names)}") 1004 raise ValueError(msg) 1005 # prohibit to use symbols may affect to parse. e.g. []< 1006 if not all(isinstance(f, str) and 1007 not any(x in f for x in set(('[', ']', '<'))) 1008 for f in feature_names): 1009 raise ValueError('feature_names must be string, and may not contain [, ] or <') 1010 c_feature_names = [bytes(f, encoding='utf-8') for f in feature_names] 1011 c_feature_names = (ctypes.c_char_p * 1012 len(c_feature_names))(*c_feature_names) 1013 _check_call(_LIB.XGDMatrixSetStrFeatureInfo( 1014 self.handle, c_str('feature_name'), 1015 c_feature_names, 1016 c_bst_ulong(len(feature_names)))) 1017 else: 1018 # reset feature_types also 1019 _check_call(_LIB.XGDMatrixSetStrFeatureInfo( 1020 self.handle, 1021 c_str('feature_name'), 1022 None, 1023 c_bst_ulong(0))) 1024 self.feature_types = None 1025 1026 @property 1027 def feature_types(self) -> Optional[List[str]]: 1028 """Get feature types (column types). 1029 1030 Returns 1031 ------- 1032 feature_types : list or None 1033 """ 1034 length = c_bst_ulong() 1035 sarr = ctypes.POINTER(ctypes.c_char_p)() 1036 _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, 1037 c_str('feature_type'), 1038 ctypes.byref(length), 1039 ctypes.byref(sarr))) 1040 res = from_cstr_to_pystr(sarr, length) 1041 if not res: 1042 return None 1043 return res 1044 1045 @feature_types.setter 1046 def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None: 1047 """Set feature types (column types). 1048 1049 This is for displaying the results and categorical data support. See doc string 1050 of :py:obj:`xgboost.DMatrix` for details. 1051 1052 Parameters 1053 ---------- 1054 feature_types : list or None 1055 Labels for features. None will reset existing feature names 1056 1057 """ 1058 # For compatibility reason this function wraps single str input into a list. But 1059 # we should not promote such usage since other than visualization, the field is 1060 # also used for specifying categorical data type. 1061 if feature_types is not None: 1062 if not isinstance(feature_types, (list, str)): 1063 raise TypeError( 1064 'feature_types must be string or list of strings') 1065 if isinstance(feature_types, str): 1066 # single string will be applied to all columns 1067 feature_types = [feature_types] * self.num_col() 1068 try: 1069 if not isinstance(feature_types, str): 1070 feature_types = list(feature_types) 1071 else: 1072 feature_types = [feature_types] 1073 except TypeError: 1074 feature_types = [feature_types] 1075 c_feature_types = [bytes(f, encoding='utf-8') 1076 for f in feature_types] 1077 c_feature_types = (ctypes.c_char_p * 1078 len(c_feature_types))(*c_feature_types) 1079 _check_call(_LIB.XGDMatrixSetStrFeatureInfo( 1080 self.handle, c_str('feature_type'), 1081 c_feature_types, 1082 c_bst_ulong(len(feature_types)))) 1083 1084 if len(feature_types) != self.num_col(): 1085 msg = 'feature_types must have the same length as data' 1086 raise ValueError(msg) 1087 else: 1088 # Reset. 1089 _check_call(_LIB.XGDMatrixSetStrFeatureInfo( 1090 self.handle, 1091 c_str('feature_type'), 1092 None, 1093 c_bst_ulong(0))) 1094 1095 1096class _ProxyDMatrix(DMatrix): 1097 """A placeholder class when DMatrix cannot be constructed (DeviceQuantileDMatrix, 1098 inplace_predict). 1099 1100 """ 1101 1102 def __init__(self): # pylint: disable=super-init-not-called 1103 self.handle = ctypes.c_void_p() 1104 _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle))) 1105 1106 def _set_data_from_cuda_interface(self, data) -> None: 1107 """Set data from CUDA array interface.""" 1108 interface = data.__cuda_array_interface__ 1109 interface_str = bytes(json.dumps(interface, indent=2), "utf-8") 1110 _check_call( 1111 _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str) 1112 ) 1113 1114 def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None: 1115 """Set data from CUDA columnar format.""" 1116 from .data import _cudf_array_interfaces 1117 1118 interfaces_str = _cudf_array_interfaces(data, cat_codes) 1119 _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str)) 1120 1121 def _set_data_from_array(self, data: np.ndarray): 1122 """Set data from numpy array.""" 1123 from .data import _array_interface 1124 1125 _check_call( 1126 _LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data)) 1127 ) 1128 1129 def _set_data_from_csr(self, csr): 1130 """Set data from scipy csr""" 1131 from .data import _array_interface 1132 1133 _LIB.XGProxyDMatrixSetDataCSR( 1134 self.handle, 1135 _array_interface(csr.indptr), 1136 _array_interface(csr.indices), 1137 _array_interface(csr.data), 1138 ctypes.c_size_t(csr.shape[1]), 1139 ) 1140 1141 1142class DeviceQuantileDMatrix(DMatrix): 1143 """Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do 1144 not use this for test/validation tasks as some information may be lost in 1145 quantisation. This DMatrix is primarily designed to save memory in training from 1146 device memory inputs by avoiding intermediate storage. Set max_bin to control the 1147 number of bins during quantisation. See doc string in :py:obj:`xgboost.DMatrix` for 1148 documents on meta info. 1149 1150 You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack. 1151 1152 .. versionadded:: 1.1.0 1153 1154 """ 1155 1156 @_deprecate_positional_args 1157 def __init__( # pylint: disable=super-init-not-called 1158 self, 1159 data, 1160 label=None, 1161 *, 1162 weight=None, 1163 base_margin=None, 1164 missing=None, 1165 silent=False, 1166 feature_names=None, 1167 feature_types=None, 1168 nthread: Optional[int] = None, 1169 max_bin: int = 256, 1170 group=None, 1171 qid=None, 1172 label_lower_bound=None, 1173 label_upper_bound=None, 1174 feature_weights=None, 1175 enable_categorical: bool = False, 1176 ): 1177 self.max_bin = max_bin 1178 self.missing = missing if missing is not None else np.nan 1179 self.nthread = nthread if nthread is not None else 1 1180 self._silent = silent # unused, kept for compatibility 1181 1182 if isinstance(data, ctypes.c_void_p): 1183 self.handle = data 1184 return 1185 1186 if qid is not None and group is not None: 1187 raise ValueError( 1188 'Only one of the eval_qid or eval_group for each evaluation ' 1189 'dataset should be provided.' 1190 ) 1191 1192 self._init( 1193 data, 1194 label=label, 1195 weight=weight, 1196 base_margin=base_margin, 1197 group=group, 1198 qid=qid, 1199 label_lower_bound=label_lower_bound, 1200 label_upper_bound=label_upper_bound, 1201 feature_weights=feature_weights, 1202 feature_names=feature_names, 1203 feature_types=feature_types, 1204 enable_categorical=enable_categorical, 1205 ) 1206 1207 def _init(self, data, enable_categorical, **meta): 1208 from .data import ( 1209 _is_dlpack, 1210 _transform_dlpack, 1211 _is_iter, 1212 SingleBatchInternalIter, 1213 ) 1214 1215 if _is_dlpack(data): 1216 # We specialize for dlpack because cupy will take the memory from it so 1217 # it can't be transformed twice. 1218 data = _transform_dlpack(data) 1219 if _is_iter(data): 1220 it = data 1221 else: 1222 it = SingleBatchInternalIter(data=data, **meta) 1223 1224 handle = ctypes.c_void_p() 1225 # pylint: disable=protected-access 1226 reset_callback, next_callback = it._get_callbacks(False, enable_categorical) 1227 if it.cache_prefix is not None: 1228 raise ValueError( 1229 "DeviceQuantileDMatrix doesn't cache data, remove the cache_prefix " 1230 "in iterator to fix this error." 1231 ) 1232 ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback( 1233 None, 1234 it.proxy.handle, 1235 reset_callback, 1236 next_callback, 1237 ctypes.c_float(self.missing), 1238 ctypes.c_int(self.nthread), 1239 ctypes.c_int(self.max_bin), 1240 ctypes.byref(handle), 1241 ) 1242 # pylint: disable=protected-access 1243 it._reraise() 1244 # delay check_call to throw intermediate exception first 1245 _check_call(ret) 1246 self.handle = handle 1247 1248 1249Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]] 1250Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]] 1251 1252 1253def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]: 1254 """Get number of trees added to booster per-iteration. This function will be removed 1255 once `best_ntree_limit` is dropped in favor of `best_iteration`. Returns 1256 `num_parallel_tree` and `num_groups`. 1257 1258 """ 1259 config = json.loads(model.save_config()) 1260 booster = config["learner"]["gradient_booster"]["name"] 1261 if booster == "gblinear": 1262 num_parallel_tree = 0 1263 elif booster == "dart": 1264 num_parallel_tree = int( 1265 config["learner"]["gradient_booster"]["gbtree"]["gbtree_train_param"][ 1266 "num_parallel_tree" 1267 ] 1268 ) 1269 elif booster == "gbtree": 1270 num_parallel_tree = int( 1271 config["learner"]["gradient_booster"]["gbtree_train_param"][ 1272 "num_parallel_tree" 1273 ] 1274 ) 1275 else: 1276 raise ValueError(f"Unknown booster: {booster}") 1277 num_groups = int(config["learner"]["learner_model_param"]["num_class"]) 1278 return num_parallel_tree, num_groups 1279 1280 1281class Booster(object): 1282 # pylint: disable=too-many-public-methods 1283 """A Booster of XGBoost. 1284 1285 Booster is the model of xgboost, that contains low level routines for 1286 training, prediction and evaluation. 1287 """ 1288 1289 def __init__(self, params=None, cache=(), model_file=None): 1290 # pylint: disable=invalid-name 1291 """ 1292 Parameters 1293 ---------- 1294 params : dict 1295 Parameters for boosters. 1296 cache : list 1297 List of cache items. 1298 model_file : string/os.PathLike/Booster/bytearray 1299 Path to the model file if it's string or PathLike. 1300 """ 1301 for d in cache: 1302 if not isinstance(d, DMatrix): 1303 raise TypeError(f'invalid cache item: {type(d).__name__}', cache) 1304 1305 dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) 1306 self.handle = ctypes.c_void_p() 1307 _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)), 1308 ctypes.byref(self.handle))) 1309 for d in cache: 1310 # Validate feature only after the feature names are saved into booster. 1311 self._validate_features(d) 1312 1313 if isinstance(model_file, Booster): 1314 assert self.handle is not None 1315 # We use the pickle interface for getting memory snapshot from 1316 # another model, and load the snapshot with this booster. 1317 state = model_file.__getstate__() 1318 handle = state['handle'] 1319 del state['handle'] 1320 ptr = (ctypes.c_char * len(handle)).from_buffer(handle) 1321 length = c_bst_ulong(len(handle)) 1322 _check_call( 1323 _LIB.XGBoosterUnserializeFromBuffer(self.handle, ptr, length)) 1324 self.__dict__.update(state) 1325 elif isinstance(model_file, (STRING_TYPES, os.PathLike, bytearray)): 1326 self.load_model(model_file) 1327 elif model_file is None: 1328 pass 1329 else: 1330 raise TypeError('Unknown type:', model_file) 1331 1332 params = params or {} 1333 params = self._configure_metrics(params.copy()) 1334 params = self._configure_constraints(params) 1335 if isinstance(params, list): 1336 params.append(('validate_parameters', True)) 1337 else: 1338 params['validate_parameters'] = True 1339 1340 self.set_param(params or {}) 1341 if (params is not None) and ('booster' in params): 1342 self.booster = params['booster'] 1343 else: 1344 self.booster = 'gbtree' 1345 1346 def _configure_metrics(self, params: Union[Dict, List]) -> Union[Dict, List]: 1347 if isinstance(params, dict) and 'eval_metric' in params \ 1348 and isinstance(params['eval_metric'], list): 1349 params = dict((k, v) for k, v in params.items()) 1350 eval_metrics = params['eval_metric'] 1351 params.pop("eval_metric", None) 1352 params = list(params.items()) 1353 for eval_metric in eval_metrics: 1354 params += [('eval_metric', eval_metric)] 1355 return params 1356 1357 def _transform_monotone_constrains(self, value: Union[Dict[str, int], str]) -> str: 1358 if isinstance(value, str): 1359 return value 1360 1361 constrained_features = set(value.keys()) 1362 if not constrained_features.issubset(set(self.feature_names or [])): 1363 raise ValueError('Constrained features are not a subset of ' 1364 'training data feature names') 1365 1366 return '(' + ','.join([str(value.get(feature_name, 0)) 1367 for feature_name in self.feature_names]) + ')' 1368 1369 def _transform_interaction_constraints( 1370 self, value: Union[List[Tuple[str]], str] 1371 ) -> str: 1372 if isinstance(value, str): 1373 return value 1374 1375 feature_idx_mapping = {k: str(v) for v, k in enumerate(self.feature_names or [])} 1376 1377 try: 1378 s = "[" 1379 for constraint in value: 1380 s += ( 1381 "[" 1382 + ",".join( 1383 [feature_idx_mapping[feature_name] for feature_name in constraint] 1384 ) 1385 + "]" 1386 ) 1387 return s + "]" 1388 except KeyError as e: 1389 # pylint: disable=raise-missing-from 1390 raise ValueError( 1391 "Constrained features are not a subset of training data feature names" 1392 ) from e 1393 1394 def _configure_constraints(self, params: Union[Dict, List]) -> Union[Dict, List]: 1395 if isinstance(params, dict): 1396 value = params.get("monotone_constraints") 1397 if value: 1398 params[ 1399 "monotone_constraints" 1400 ] = self._transform_monotone_constrains(value) 1401 1402 value = params.get("interaction_constraints") 1403 if value: 1404 params[ 1405 "interaction_constraints" 1406 ] = self._transform_interaction_constraints(value) 1407 1408 elif isinstance(params, list): 1409 for idx, param in enumerate(params): 1410 name, value = param 1411 if not value: 1412 continue 1413 1414 if name == "monotone_constraints": 1415 params[idx] = (name, self._transform_monotone_constrains(value)) 1416 elif name == "interaction_constraints": 1417 params[idx] = (name, self._transform_interaction_constraints(value)) 1418 1419 return params 1420 1421 def __del__(self): 1422 if hasattr(self, 'handle') and self.handle is not None: 1423 _check_call(_LIB.XGBoosterFree(self.handle)) 1424 self.handle = None 1425 1426 def __getstate__(self): 1427 # can't pickle ctypes pointers, put model content in bytearray 1428 this = self.__dict__.copy() 1429 handle = this['handle'] 1430 if handle is not None: 1431 length = c_bst_ulong() 1432 cptr = ctypes.POINTER(ctypes.c_char)() 1433 _check_call(_LIB.XGBoosterSerializeToBuffer(self.handle, 1434 ctypes.byref(length), 1435 ctypes.byref(cptr))) 1436 buf = ctypes2buffer(cptr, length.value) 1437 this["handle"] = buf 1438 return this 1439 1440 def __setstate__(self, state): 1441 # reconstruct handle from raw data 1442 handle = state['handle'] 1443 if handle is not None: 1444 buf = handle 1445 dmats = c_array(ctypes.c_void_p, []) 1446 handle = ctypes.c_void_p() 1447 _check_call(_LIB.XGBoosterCreate( 1448 dmats, c_bst_ulong(0), ctypes.byref(handle))) 1449 length = c_bst_ulong(len(buf)) 1450 ptr = (ctypes.c_char * len(buf)).from_buffer(buf) 1451 _check_call( 1452 _LIB.XGBoosterUnserializeFromBuffer(handle, ptr, length)) 1453 state['handle'] = handle 1454 self.__dict__.update(state) 1455 1456 def __getitem__(self, val): 1457 if isinstance(val, int): 1458 val = slice(val, val+1) 1459 if isinstance(val, tuple): 1460 raise ValueError('Only supports slicing through 1 dimension.') 1461 if not isinstance(val, slice): 1462 msg = _expect((int, slice), type(val)) 1463 raise TypeError(msg) 1464 if isinstance(val.start, type(Ellipsis)) or val.start is None: 1465 start = 0 1466 else: 1467 start = val.start 1468 if isinstance(val.stop, type(Ellipsis)) or val.stop is None: 1469 stop = 0 1470 else: 1471 stop = val.stop 1472 if stop < start: 1473 raise ValueError('Invalid slice', val) 1474 1475 step = val.step if val.step is not None else 1 1476 1477 start = ctypes.c_int(start) 1478 stop = ctypes.c_int(stop) 1479 step = ctypes.c_int(step) 1480 1481 sliced_handle = ctypes.c_void_p() 1482 status = _LIB.XGBoosterSlice(self.handle, start, stop, step, 1483 ctypes.byref(sliced_handle)) 1484 if status == -2: 1485 raise IndexError('Layer index out of range') 1486 _check_call(status) 1487 1488 sliced = Booster() 1489 _check_call(_LIB.XGBoosterFree(sliced.handle)) 1490 sliced.handle = sliced_handle 1491 return sliced 1492 1493 def save_config(self): 1494 '''Output internal parameter configuration of Booster as a JSON 1495 string. 1496 1497 .. versionadded:: 1.0.0 1498 ''' 1499 json_string = ctypes.c_char_p() 1500 length = c_bst_ulong() 1501 _check_call(_LIB.XGBoosterSaveJsonConfig( 1502 self.handle, 1503 ctypes.byref(length), 1504 ctypes.byref(json_string))) 1505 json_string = json_string.value.decode() # pylint: disable=no-member 1506 return json_string 1507 1508 def load_config(self, config): 1509 '''Load configuration returned by `save_config`. 1510 1511 .. versionadded:: 1.0.0 1512 ''' 1513 assert isinstance(config, str) 1514 _check_call(_LIB.XGBoosterLoadJsonConfig( 1515 self.handle, 1516 c_str(config))) 1517 1518 def __copy__(self): 1519 return self.__deepcopy__(None) 1520 1521 def __deepcopy__(self, _): 1522 '''Return a copy of booster.''' 1523 return Booster(model_file=self) 1524 1525 def copy(self): 1526 """Copy the booster object. 1527 1528 Returns 1529 ------- 1530 booster: `Booster` 1531 a copied booster model 1532 """ 1533 return self.__copy__() 1534 1535 def attr(self, key): 1536 """Get attribute string from the Booster. 1537 1538 Parameters 1539 ---------- 1540 key : str 1541 The key to get attribute from. 1542 1543 Returns 1544 ------- 1545 value : str 1546 The attribute value of the key, returns None if attribute do not exist. 1547 """ 1548 ret = ctypes.c_char_p() 1549 success = ctypes.c_int() 1550 _check_call(_LIB.XGBoosterGetAttr( 1551 self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success))) 1552 if success.value != 0: 1553 return py_str(ret.value) 1554 return None 1555 1556 def attributes(self): 1557 """Get attributes stored in the Booster as a dictionary. 1558 1559 Returns 1560 ------- 1561 result : dictionary of attribute_name: attribute_value pairs of strings. 1562 Returns an empty dict if there's no attributes. 1563 """ 1564 length = c_bst_ulong() 1565 sarr = ctypes.POINTER(ctypes.c_char_p)() 1566 _check_call(_LIB.XGBoosterGetAttrNames(self.handle, 1567 ctypes.byref(length), 1568 ctypes.byref(sarr))) 1569 attr_names = from_cstr_to_pystr(sarr, length) 1570 return {n: self.attr(n) for n in attr_names} 1571 1572 def set_attr(self, **kwargs: Optional[str]) -> None: 1573 """Set the attribute of the Booster. 1574 1575 Parameters 1576 ---------- 1577 **kwargs 1578 The attributes to set. Setting a value to None deletes an attribute. 1579 """ 1580 for key, value in kwargs.items(): 1581 if value is not None: 1582 if not isinstance(value, STRING_TYPES): 1583 raise ValueError("Set Attr only accepts string values") 1584 value = c_str(str(value)) 1585 _check_call(_LIB.XGBoosterSetAttr( 1586 self.handle, c_str(key), value)) 1587 1588 def _get_feature_info(self, field: str): 1589 length = c_bst_ulong() 1590 sarr = ctypes.POINTER(ctypes.c_char_p)() 1591 if not hasattr(self, "handle") or self.handle is None: 1592 return None 1593 _check_call( 1594 _LIB.XGBoosterGetStrFeatureInfo( 1595 self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr), 1596 ) 1597 ) 1598 feature_info = from_cstr_to_pystr(sarr, length) 1599 return feature_info if feature_info else None 1600 1601 @property 1602 def feature_types(self) -> Optional[List[str]]: 1603 """Feature types for this booster. Can be directly set by input data or by 1604 assignment. 1605 1606 """ 1607 return self._get_feature_info("feature_type") 1608 1609 @property 1610 def feature_names(self) -> Optional[List[str]]: 1611 """Feature names for this booster. Can be directly set by input data or by 1612 assignment. 1613 1614 """ 1615 return self._get_feature_info("feature_name") 1616 1617 def _set_feature_info(self, features: Optional[List[str]], field: str) -> None: 1618 if features is not None: 1619 assert isinstance(features, list) 1620 c_feature_info = [bytes(f, encoding="utf-8") for f in features] 1621 c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info) 1622 _check_call( 1623 _LIB.XGBoosterSetStrFeatureInfo( 1624 self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features)) 1625 ) 1626 ) 1627 else: 1628 _check_call( 1629 _LIB.XGBoosterSetStrFeatureInfo( 1630 self.handle, c_str(field), None, c_bst_ulong(0) 1631 ) 1632 ) 1633 1634 @feature_names.setter 1635 def feature_names(self, features: Optional[List[str]]) -> None: 1636 self._set_feature_info(features, "feature_name") 1637 1638 @feature_types.setter 1639 def feature_types(self, features: Optional[List[str]]) -> None: 1640 self._set_feature_info(features, "feature_type") 1641 1642 def set_param(self, params, value=None): 1643 """Set parameters into the Booster. 1644 1645 Parameters 1646 ---------- 1647 params: dict/list/str 1648 list of key,value pairs, dict of key to value or simply str key 1649 value: optional 1650 value of the specified parameter, when params is str key 1651 """ 1652 if isinstance(params, Mapping): 1653 params = params.items() 1654 elif isinstance(params, STRING_TYPES) and value is not None: 1655 params = [(params, value)] 1656 for key, val in params: 1657 if val is not None: 1658 _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key), 1659 c_str(str(val)))) 1660 1661 def update(self, dtrain, iteration, fobj=None): 1662 """Update for one iteration, with objective function calculated 1663 internally. This function should not be called directly by users. 1664 1665 Parameters 1666 ---------- 1667 dtrain : DMatrix 1668 Training data. 1669 iteration : int 1670 Current iteration number. 1671 fobj : function 1672 Customized objective function. 1673 1674 """ 1675 if not isinstance(dtrain, DMatrix): 1676 raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") 1677 self._validate_features(dtrain) 1678 1679 if fobj is None: 1680 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, 1681 ctypes.c_int(iteration), 1682 dtrain.handle)) 1683 else: 1684 pred = self.predict(dtrain, output_margin=True, training=True) 1685 grad, hess = fobj(pred, dtrain) 1686 self.boost(dtrain, grad, hess) 1687 1688 def boost(self, dtrain, grad, hess): 1689 """Boost the booster for one iteration, with customized gradient 1690 statistics. Like :py:func:`xgboost.Booster.update`, this 1691 function should not be called directly by users. 1692 1693 Parameters 1694 ---------- 1695 dtrain : DMatrix 1696 The training DMatrix. 1697 grad : list 1698 The first order of gradient. 1699 hess : list 1700 The second order of gradient. 1701 1702 """ 1703 if len(grad) != len(hess): 1704 raise ValueError( 1705 f"grad / hess length mismatch: {len(grad)} / {len(hess)}" 1706 ) 1707 if not isinstance(dtrain, DMatrix): 1708 raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") 1709 self._validate_features(dtrain) 1710 1711 _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, 1712 c_array(ctypes.c_float, grad), 1713 c_array(ctypes.c_float, hess), 1714 c_bst_ulong(len(grad)))) 1715 1716 def eval_set(self, evals, iteration=0, feval=None): 1717 # pylint: disable=invalid-name 1718 """Evaluate a set of data. 1719 1720 Parameters 1721 ---------- 1722 evals : list of tuples (DMatrix, string) 1723 List of items to be evaluated. 1724 iteration : int 1725 Current iteration. 1726 feval : function 1727 Custom evaluation function. 1728 1729 Returns 1730 ------- 1731 result: str 1732 Evaluation result string. 1733 """ 1734 for d in evals: 1735 if not isinstance(d[0], DMatrix): 1736 raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}") 1737 if not isinstance(d[1], STRING_TYPES): 1738 raise TypeError(f"expected string, got {type(d[1]).__name__}") 1739 self._validate_features(d[0]) 1740 1741 dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) 1742 evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) 1743 msg = ctypes.c_char_p() 1744 _check_call(_LIB.XGBoosterEvalOneIter(self.handle, 1745 ctypes.c_int(iteration), 1746 dmats, evnames, 1747 c_bst_ulong(len(evals)), 1748 ctypes.byref(msg))) 1749 res = msg.value.decode() # pylint: disable=no-member 1750 if feval is not None: 1751 for dmat, evname in evals: 1752 feval_ret = feval(self.predict(dmat, training=False, 1753 output_margin=True), dmat) 1754 if isinstance(feval_ret, list): 1755 for name, val in feval_ret: 1756 # pylint: disable=consider-using-f-string 1757 res += '\t%s-%s:%f' % (evname, name, val) 1758 else: 1759 name, val = feval_ret 1760 # pylint: disable=consider-using-f-string 1761 res += '\t%s-%s:%f' % (evname, name, val) 1762 return res 1763 1764 def eval(self, data, name='eval', iteration=0): 1765 """Evaluate the model on mat. 1766 1767 Parameters 1768 ---------- 1769 data : DMatrix 1770 The dmatrix storing the input. 1771 1772 name : str, optional 1773 The name of the dataset. 1774 1775 iteration : int, optional 1776 The current iteration number. 1777 1778 Returns 1779 ------- 1780 result: str 1781 Evaluation result string. 1782 """ 1783 self._validate_features(data) 1784 return self.eval_set([(data, name)], iteration) 1785 1786 # pylint: disable=too-many-function-args 1787 def predict( 1788 self, 1789 data: DMatrix, 1790 output_margin: bool = False, 1791 ntree_limit: int = 0, 1792 pred_leaf: bool = False, 1793 pred_contribs: bool = False, 1794 approx_contribs: bool = False, 1795 pred_interactions: bool = False, 1796 validate_features: bool = True, 1797 training: bool = False, 1798 iteration_range: Tuple[int, int] = (0, 0), 1799 strict_shape: bool = False, 1800 ) -> np.ndarray: 1801 """Predict with data. The full model will be used unless `iteration_range` is specified, 1802 meaning user have to either slice the model or use the ``best_iteration`` 1803 attribute to get prediction from best model returned from early stopping. 1804 1805 .. note:: 1806 1807 See `Prediction 1808 <https://xgboost.readthedocs.io/en/latest/prediction.html>`_ 1809 for issues like thread safety and a summary of outputs from this function. 1810 1811 Parameters 1812 ---------- 1813 data : 1814 The dmatrix storing the input. 1815 1816 output_margin : 1817 Whether to output the raw untransformed margin value. 1818 1819 ntree_limit : 1820 Deprecated, use `iteration_range` instead. 1821 1822 pred_leaf : 1823 When this option is on, the output will be a matrix of (nsample, 1824 ntrees) with each record indicating the predicted leaf index of 1825 each sample in each tree. Note that the leaf index of a tree is 1826 unique per tree, so you may find leaf 1 in both tree 1 and tree 0. 1827 1828 pred_contribs : 1829 When this is True the output will be a matrix of size (nsample, 1830 nfeats + 1) with each record indicating the feature contributions 1831 (SHAP values) for that prediction. The sum of all feature 1832 contributions is equal to the raw untransformed margin value of the 1833 prediction. Note the final column is the bias term. 1834 1835 approx_contribs : 1836 Approximate the contributions of each feature. Used when ``pred_contribs`` or 1837 ``pred_interactions`` is set to True. Changing the default of this parameter 1838 (False) is not recommended. 1839 1840 pred_interactions : 1841 When this is True the output will be a matrix of size (nsample, 1842 nfeats + 1, nfeats + 1) indicating the SHAP interaction values for 1843 each pair of features. The sum of each row (or column) of the 1844 interaction values equals the corresponding SHAP value (from 1845 pred_contribs), and the sum of the entire matrix equals the raw 1846 untransformed margin value of the prediction. Note the last row and 1847 column correspond to the bias term. 1848 1849 validate_features : 1850 When this is True, validate that the Booster's and data's 1851 feature_names are identical. Otherwise, it is assumed that the 1852 feature_names are the same. 1853 1854 training : 1855 Whether the prediction value is used for training. This can effect `dart` 1856 booster, which performs dropouts during training iterations but use all trees 1857 for inference. If you want to obtain result with dropouts, set this parameter 1858 to `True`. Also, the parameter is set to true when obtaining prediction for 1859 custom objective function. 1860 1861 .. versionadded:: 1.0.0 1862 1863 iteration_range : 1864 Specifies which layer of trees are used in prediction. For example, if a 1865 random forest is trained with 100 rounds. Specifying `iteration_range=(10, 1866 20)`, then only the forests built during [10, 20) (half open set) rounds are 1867 used in this prediction. 1868 1869 .. versionadded:: 1.4.0 1870 1871 strict_shape : 1872 When set to True, output shape is invariant to whether classification is used. 1873 For both value and margin prediction, the output shape is (n_samples, 1874 n_groups), n_groups == 1 when multi-class is not used. Default to False, in 1875 which case the output shape can be (n_samples, ) if multi-class is not used. 1876 1877 .. versionadded:: 1.4.0 1878 1879 Returns 1880 ------- 1881 prediction : numpy array 1882 1883 """ 1884 if not isinstance(data, DMatrix): 1885 raise TypeError('Expecting data to be a DMatrix object, got: ', type(data)) 1886 if validate_features: 1887 self._validate_features(data) 1888 iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range) 1889 args = { 1890 "type": 0, 1891 "training": training, 1892 "iteration_begin": iteration_range[0], 1893 "iteration_end": iteration_range[1], 1894 "strict_shape": strict_shape, 1895 } 1896 1897 def assign_type(t: int) -> None: 1898 if args["type"] != 0: 1899 raise ValueError("One type of prediction at a time.") 1900 args["type"] = t 1901 1902 if output_margin: 1903 assign_type(1) 1904 if pred_contribs: 1905 assign_type(2 if not approx_contribs else 3) 1906 if pred_interactions: 1907 assign_type(4 if not approx_contribs else 5) 1908 if pred_leaf: 1909 assign_type(6) 1910 preds = ctypes.POINTER(ctypes.c_float)() 1911 shape = ctypes.POINTER(c_bst_ulong)() 1912 dims = c_bst_ulong() 1913 _check_call( 1914 _LIB.XGBoosterPredictFromDMatrix( 1915 self.handle, 1916 data.handle, 1917 from_pystr_to_cstr(json.dumps(args)), 1918 ctypes.byref(shape), 1919 ctypes.byref(dims), 1920 ctypes.byref(preds) 1921 ) 1922 ) 1923 return _prediction_output(shape, dims, preds, False) 1924 1925 def inplace_predict( 1926 self, 1927 data: Any, 1928 iteration_range: Tuple[int, int] = (0, 0), 1929 predict_type: str = "value", 1930 missing: float = np.nan, 1931 validate_features: bool = True, 1932 base_margin: Any = None, 1933 strict_shape: bool = False 1934 ): 1935 """Run prediction in-place, Unlike ``predict`` method, inplace prediction does 1936 not cache the prediction result. 1937 1938 Calling only ``inplace_predict`` in multiple threads is safe and lock 1939 free. But the safety does not hold when used in conjunction with other 1940 methods. E.g. you can't train the booster in one thread and perform 1941 prediction in the other. 1942 1943 .. code-block:: python 1944 1945 booster.set_param({'predictor': 'gpu_predictor'}) 1946 booster.inplace_predict(cupy_array) 1947 1948 booster.set_param({'predictor': 'cpu_predictor}) 1949 booster.inplace_predict(numpy_array) 1950 1951 .. versionadded:: 1.1.0 1952 1953 Parameters 1954 ---------- 1955 data : numpy.ndarray/scipy.sparse.csr_matrix/cupy.ndarray/ 1956 cudf.DataFrame/pd.DataFrame 1957 The input data, must not be a view for numpy array. Set 1958 ``predictor`` to ``gpu_predictor`` for running prediction on CuPy 1959 array or CuDF DataFrame. 1960 iteration_range : 1961 See :py:meth:`xgboost.Booster.predict` for details. 1962 predict_type : 1963 * `value` Output model prediction values. 1964 * `margin` Output the raw untransformed margin value. 1965 missing : 1966 See :py:obj:`xgboost.DMatrix` for details. 1967 validate_features: 1968 See :py:meth:`xgboost.Booster.predict` for details. 1969 base_margin: 1970 See :py:obj:`xgboost.DMatrix` for details. 1971 1972 .. versionadded:: 1.4.0 1973 1974 strict_shape: 1975 See :py:meth:`xgboost.Booster.predict` for details. 1976 1977 .. versionadded:: 1.4.0 1978 1979 Returns 1980 ------- 1981 prediction : numpy.ndarray/cupy.ndarray 1982 The prediction result. When input data is on GPU, prediction 1983 result is stored in a cupy array. 1984 1985 """ 1986 preds = ctypes.POINTER(ctypes.c_float)() 1987 1988 # once caching is supported, we can pass id(data) as cache id. 1989 args = { 1990 "type": 0, 1991 "training": False, 1992 "iteration_begin": iteration_range[0], 1993 "iteration_end": iteration_range[1], 1994 "missing": missing, 1995 "strict_shape": strict_shape, 1996 "cache_id": 0, 1997 } 1998 if predict_type == "margin": 1999 args["type"] = 1 2000 shape = ctypes.POINTER(c_bst_ulong)() 2001 dims = c_bst_ulong() 2002 2003 if base_margin is not None: 2004 proxy: Optional[_ProxyDMatrix] = _ProxyDMatrix() 2005 assert proxy is not None 2006 proxy.set_info(base_margin=base_margin) 2007 p_handle = proxy.handle 2008 else: 2009 proxy = None 2010 p_handle = ctypes.c_void_p() 2011 assert proxy is None or isinstance(proxy, _ProxyDMatrix) 2012 if validate_features: 2013 if not hasattr(data, "shape"): 2014 raise TypeError( 2015 "`shape` attribute is required when `validate_features` is True." 2016 ) 2017 if len(data.shape) != 1 and self.num_features() != data.shape[1]: 2018 raise ValueError( 2019 f"Feature shape mismatch, expected: {self.num_features()}, " 2020 f"got {data.shape[1]}" 2021 ) 2022 2023 from .data import _is_pandas_df, _transform_pandas_df 2024 from .data import _array_interface 2025 if ( 2026 _is_pandas_df(data) 2027 or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame") 2028 ): 2029 ft = self.feature_types 2030 if ft is None: 2031 enable_categorical = False 2032 else: 2033 enable_categorical = any(f == "c" for f in ft) 2034 if _is_pandas_df(data): 2035 data, _, _ = _transform_pandas_df(data, enable_categorical) 2036 2037 if isinstance(data, np.ndarray): 2038 from .data import _ensure_np_dtype 2039 data, _ = _ensure_np_dtype(data, data.dtype) 2040 _check_call( 2041 _LIB.XGBoosterPredictFromDense( 2042 self.handle, 2043 _array_interface(data), 2044 from_pystr_to_cstr(json.dumps(args)), 2045 p_handle, 2046 ctypes.byref(shape), 2047 ctypes.byref(dims), 2048 ctypes.byref(preds), 2049 ) 2050 ) 2051 return _prediction_output(shape, dims, preds, False) 2052 if isinstance(data, scipy.sparse.csr_matrix): 2053 csr = data 2054 _check_call( 2055 _LIB.XGBoosterPredictFromCSR( 2056 self.handle, 2057 _array_interface(csr.indptr), 2058 _array_interface(csr.indices), 2059 _array_interface(csr.data), 2060 ctypes.c_size_t(csr.shape[1]), 2061 from_pystr_to_cstr(json.dumps(args)), 2062 p_handle, 2063 ctypes.byref(shape), 2064 ctypes.byref(dims), 2065 ctypes.byref(preds), 2066 ) 2067 ) 2068 return _prediction_output(shape, dims, preds, False) 2069 if lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance( 2070 data, "cupy._core.core", "ndarray" 2071 ): 2072 from .data import _transform_cupy_array 2073 2074 data = _transform_cupy_array(data) 2075 interface_str = _cuda_array_interface(data) 2076 _check_call( 2077 _LIB.XGBoosterPredictFromCudaArray( 2078 self.handle, 2079 interface_str, 2080 from_pystr_to_cstr(json.dumps(args)), 2081 p_handle, 2082 ctypes.byref(shape), 2083 ctypes.byref(dims), 2084 ctypes.byref(preds), 2085 ) 2086 ) 2087 return _prediction_output(shape, dims, preds, True) 2088 if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"): 2089 from .data import _cudf_array_interfaces, _transform_cudf_df 2090 data, cat_codes, _, _ = _transform_cudf_df( 2091 data, None, None, enable_categorical 2092 ) 2093 interfaces_str = _cudf_array_interfaces(data, cat_codes) 2094 _check_call( 2095 _LIB.XGBoosterPredictFromCudaColumnar( 2096 self.handle, 2097 interfaces_str, 2098 from_pystr_to_cstr(json.dumps(args)), 2099 p_handle, 2100 ctypes.byref(shape), 2101 ctypes.byref(dims), 2102 ctypes.byref(preds), 2103 ) 2104 ) 2105 return _prediction_output(shape, dims, preds, True) 2106 2107 raise TypeError( 2108 "Data type:" + str(type(data)) + " not supported by inplace prediction." 2109 ) 2110 2111 def save_model(self, fname: Union[str, os.PathLike]): 2112 """Save the model to a file. 2113 2114 The model is saved in an XGBoost internal format which is universal among the 2115 various XGBoost interfaces. Auxiliary attributes of the Python Booster object 2116 (such as feature_names) will not be saved when using binary format. To save those 2117 attributes, use JSON instead. See: `Model IO 2118 <https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html>`_ for more 2119 info. 2120 2121 Parameters 2122 ---------- 2123 fname : string or os.PathLike 2124 Output file name 2125 2126 """ 2127 if isinstance(fname, (STRING_TYPES, os.PathLike)): # assume file name 2128 fname = os.fspath(os.path.expanduser(fname)) 2129 _check_call(_LIB.XGBoosterSaveModel( 2130 self.handle, c_str(fname))) 2131 else: 2132 raise TypeError("fname must be a string or os PathLike") 2133 2134 def save_raw(self): 2135 """Save the model to a in memory buffer representation instead of file. 2136 2137 Returns 2138 ------- 2139 a in memory buffer representation of the model 2140 """ 2141 length = c_bst_ulong() 2142 cptr = ctypes.POINTER(ctypes.c_char)() 2143 _check_call(_LIB.XGBoosterGetModelRaw(self.handle, 2144 ctypes.byref(length), 2145 ctypes.byref(cptr))) 2146 return ctypes2buffer(cptr, length.value) 2147 2148 def load_model(self, fname: Union[str, bytearray, os.PathLike]) -> None: 2149 """Load the model from a file or bytearray. Path to file can be local 2150 or as an URI. 2151 2152 The model is loaded from XGBoost format which is universal among the various 2153 XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as 2154 feature_names) will not be loaded when using binary format. To save those 2155 attributes, use JSON instead. See: `Model IO 2156 <https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html>`_ for more 2157 info. 2158 2159 Parameters 2160 ---------- 2161 fname : 2162 Input file name or memory buffer(see also save_raw) 2163 2164 """ 2165 if isinstance(fname, (str, os.PathLike)): 2166 # assume file name, cannot use os.path.exist to check, file can be 2167 # from URL. 2168 fname = os.fspath(os.path.expanduser(fname)) 2169 _check_call(_LIB.XGBoosterLoadModel( 2170 self.handle, c_str(fname))) 2171 elif isinstance(fname, bytearray): 2172 buf = fname 2173 length = c_bst_ulong(len(buf)) 2174 ptr = (ctypes.c_char * len(buf)).from_buffer(buf) 2175 _check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, 2176 length)) 2177 else: 2178 raise TypeError('Unknown file type: ', fname) 2179 2180 if self.attr("best_iteration") is not None: 2181 self.best_iteration = int(self.attr("best_iteration")) 2182 if self.attr("best_score") is not None: 2183 self.best_score = float(self.attr("best_score")) 2184 if self.attr("best_ntree_limit") is not None: 2185 self.best_ntree_limit = int(self.attr("best_ntree_limit")) 2186 2187 def num_boosted_rounds(self) -> int: 2188 '''Get number of boosted rounds. For gblinear this is reset to 0 after 2189 serializing the model. 2190 2191 ''' 2192 rounds = ctypes.c_int() 2193 assert self.handle is not None 2194 _check_call(_LIB.XGBoosterBoostedRounds(self.handle, ctypes.byref(rounds))) 2195 return rounds.value 2196 2197 def num_features(self) -> int: 2198 '''Number of features in booster.''' 2199 features = ctypes.c_int() 2200 assert self.handle is not None 2201 _check_call(_LIB.XGBoosterGetNumFeature(self.handle, ctypes.byref(features))) 2202 return features.value 2203 2204 def dump_model(self, fout, fmap='', with_stats=False, dump_format="text"): 2205 """Dump model into a text or JSON file. Unlike `save_model`, the 2206 output format is primarily used for visualization or interpretation, 2207 hence it's more human readable but cannot be loaded back to XGBoost. 2208 2209 Parameters 2210 ---------- 2211 fout : string or os.PathLike 2212 Output file name. 2213 fmap : string or os.PathLike, optional 2214 Name of the file containing feature map names. 2215 with_stats : bool, optional 2216 Controls whether the split statistics are output. 2217 dump_format : string, optional 2218 Format of model dump file. Can be 'text' or 'json'. 2219 """ 2220 if isinstance(fout, (STRING_TYPES, os.PathLike)): 2221 fout = os.fspath(os.path.expanduser(fout)) 2222 # pylint: disable=consider-using-with 2223 fout = open(fout, 'w', encoding="utf-8") 2224 need_close = True 2225 else: 2226 need_close = False 2227 ret = self.get_dump(fmap, with_stats, dump_format) 2228 if dump_format == 'json': 2229 fout.write('[\n') 2230 for i, _ in enumerate(ret): 2231 fout.write(ret[i]) 2232 if i < len(ret) - 1: 2233 fout.write(",\n") 2234 fout.write('\n]') 2235 else: 2236 for i, _ in enumerate(ret): 2237 fout.write(f"booster[{i}]:\n") 2238 fout.write(ret[i]) 2239 if need_close: 2240 fout.close() 2241 2242 def get_dump(self, fmap='', with_stats=False, dump_format="text"): 2243 """Returns the model dump as a list of strings. Unlike `save_model`, the 2244 output format is primarily used for visualization or interpretation, 2245 hence it's more human readable but cannot be loaded back to XGBoost. 2246 2247 Parameters 2248 ---------- 2249 fmap : string or os.PathLike, optional 2250 Name of the file containing feature map names. 2251 with_stats : bool, optional 2252 Controls whether the split statistics are output. 2253 dump_format : string, optional 2254 Format of model dump. Can be 'text', 'json' or 'dot'. 2255 2256 """ 2257 fmap = os.fspath(os.path.expanduser(fmap)) 2258 length = c_bst_ulong() 2259 sarr = ctypes.POINTER(ctypes.c_char_p)() 2260 _check_call(_LIB.XGBoosterDumpModelEx(self.handle, 2261 c_str(fmap), 2262 ctypes.c_int(with_stats), 2263 c_str(dump_format), 2264 ctypes.byref(length), 2265 ctypes.byref(sarr))) 2266 res = from_cstr_to_pystr(sarr, length) 2267 return res 2268 2269 def get_fscore(self, fmap=''): 2270 """Get feature importance of each feature. 2271 2272 .. note:: Zero-importance features will not be included 2273 2274 Keep in mind that this function does not include zero-importance feature, i.e. 2275 those features that have not been used in any split conditions. 2276 2277 Parameters 2278 ---------- 2279 fmap: str or os.PathLike (optional) 2280 The name of feature map file 2281 """ 2282 2283 return self.get_score(fmap, importance_type='weight') 2284 2285 def get_score( 2286 self, fmap: Union[str, os.PathLike] = '', importance_type: str = 'weight' 2287 ) -> Dict[str, Union[float, List[float]]]: 2288 """Get feature importance of each feature. 2289 For tree model Importance type can be defined as: 2290 2291 * 'weight': the number of times a feature is used to split the data across all trees. 2292 * 'gain': the average gain across all splits the feature is used in. 2293 * 'cover': the average coverage across all splits the feature is used in. 2294 * 'total_gain': the total gain across all splits the feature is used in. 2295 * 'total_cover': the total coverage across all splits the feature is used in. 2296 2297 .. note:: 2298 2299 For linear model, only "weight" is defined and it's the normalized coefficients 2300 without bias. 2301 2302 .. note:: Zero-importance features will not be included 2303 2304 Keep in mind that this function does not include zero-importance feature, i.e. 2305 those features that have not been used in any split conditions. 2306 2307 Parameters 2308 ---------- 2309 fmap: str or os.PathLike (optional) 2310 The name of feature map file. 2311 importance_type: str, default 'weight' 2312 One of the importance types defined above. 2313 2314 Returns 2315 ------- 2316 A map between feature names and their scores. When `gblinear` is used for 2317 multi-class classification the scores for each feature is a list with length 2318 `n_classes`, otherwise they're scalars. 2319 """ 2320 fmap = os.fspath(os.path.expanduser(fmap)) 2321 args = from_pystr_to_cstr( 2322 json.dumps({"importance_type": importance_type, "feature_map": fmap}) 2323 ) 2324 features = ctypes.POINTER(ctypes.c_char_p)() 2325 scores = ctypes.POINTER(ctypes.c_float)() 2326 n_out_features = c_bst_ulong() 2327 out_dim = c_bst_ulong() 2328 shape = ctypes.POINTER(c_bst_ulong)() 2329 2330 _check_call( 2331 _LIB.XGBoosterFeatureScore( 2332 self.handle, 2333 args, 2334 ctypes.byref(n_out_features), 2335 ctypes.byref(features), 2336 ctypes.byref(out_dim), 2337 ctypes.byref(shape), 2338 ctypes.byref(scores), 2339 ) 2340 ) 2341 features_arr = from_cstr_to_pystr(features, n_out_features) 2342 scores_arr = _prediction_output(shape, out_dim, scores, False) 2343 2344 results: Dict[str, Union[float, List[float]]] = {} 2345 if len(scores_arr.shape) > 1 and scores_arr.shape[1] > 1: 2346 for feat, score in zip(features_arr, scores_arr): 2347 results[feat] = [float(s) for s in score] 2348 else: 2349 for feat, score in zip(features_arr, scores_arr): 2350 results[feat] = float(score) 2351 return results 2352 2353 def trees_to_dataframe(self, fmap=''): # pylint: disable=too-many-statements 2354 """Parse a boosted tree model text dump into a pandas DataFrame structure. 2355 2356 This feature is only defined when the decision tree model is chosen as base 2357 learner (`booster in {gbtree, dart}`). It is not defined for other base learner 2358 types, such as linear learners (`booster=gblinear`). 2359 2360 Parameters 2361 ---------- 2362 fmap: str or os.PathLike (optional) 2363 The name of feature map file. 2364 """ 2365 # pylint: disable=too-many-locals 2366 fmap = os.fspath(os.path.expanduser(fmap)) 2367 if not PANDAS_INSTALLED: 2368 raise ImportError(('pandas must be available to use this method.' 2369 'Install pandas before calling again.')) 2370 2371 if getattr(self, 'booster', None) is not None and self.booster not in {'gbtree', 'dart'}: 2372 raise ValueError( 2373 f"This method is not defined for Booster type {self.booster}" 2374 ) 2375 2376 tree_ids = [] 2377 node_ids = [] 2378 fids = [] 2379 splits = [] 2380 categories = [] 2381 y_directs = [] 2382 n_directs = [] 2383 missings = [] 2384 gains = [] 2385 covers = [] 2386 2387 trees = self.get_dump(fmap, with_stats=True) 2388 for i, tree in enumerate(trees): 2389 for line in tree.split('\n'): 2390 arr = line.split('[') 2391 # Leaf node 2392 if len(arr) == 1: 2393 # Last element of line.split is an empy string 2394 if arr == ['']: 2395 continue 2396 # parse string 2397 parse = arr[0].split(':') 2398 stats = re.split('=|,', parse[1]) 2399 2400 # append to lists 2401 tree_ids.append(i) 2402 node_ids.append(int(re.findall(r'\b\d+\b', parse[0])[0])) 2403 fids.append('Leaf') 2404 splits.append(float('NAN')) 2405 categories.append(float('NAN')) 2406 y_directs.append(float('NAN')) 2407 n_directs.append(float('NAN')) 2408 missings.append(float('NAN')) 2409 gains.append(float(stats[1])) 2410 covers.append(float(stats[3])) 2411 # Not a Leaf Node 2412 else: 2413 # parse string 2414 fid = arr[1].split(']') 2415 if fid[0].find("<") != -1: 2416 # numerical 2417 parse = fid[0].split('<') 2418 splits.append(float(parse[1])) 2419 categories.append(None) 2420 elif fid[0].find(":{") != -1: 2421 # categorical 2422 parse = fid[0].split(":") 2423 cats = parse[1][1:-1] # strip the {} 2424 cats = cats.split(",") 2425 splits.append(float("NAN")) 2426 categories.append(cats if cats else None) 2427 else: 2428 raise ValueError("Failed to parse model text dump.") 2429 stats = re.split('=|,', fid[1]) 2430 2431 # append to lists 2432 tree_ids.append(i) 2433 node_ids.append(int(re.findall(r'\b\d+\b', arr[0])[0])) 2434 fids.append(parse[0]) 2435 str_i = str(i) 2436 y_directs.append(str_i + '-' + stats[1]) 2437 n_directs.append(str_i + '-' + stats[3]) 2438 missings.append(str_i + '-' + stats[5]) 2439 gains.append(float(stats[7])) 2440 covers.append(float(stats[9])) 2441 2442 ids = [str(t_id) + '-' + str(n_id) for t_id, n_id in zip(tree_ids, node_ids)] 2443 df = DataFrame({'Tree': tree_ids, 'Node': node_ids, 'ID': ids, 2444 'Feature': fids, 'Split': splits, 'Yes': y_directs, 2445 'No': n_directs, 'Missing': missings, 'Gain': gains, 2446 'Cover': covers, "Category": categories}) 2447 2448 if callable(getattr(df, 'sort_values', None)): 2449 # pylint: disable=no-member 2450 return df.sort_values(['Tree', 'Node']).reset_index(drop=True) 2451 # pylint: disable=no-member 2452 return df.sort(['Tree', 'Node']).reset_index(drop=True) 2453 2454 def _validate_features(self, data: DMatrix): 2455 """ 2456 Validate Booster and data's feature_names are identical. 2457 Set feature_names and feature_types from DMatrix 2458 """ 2459 if data.num_row() == 0: 2460 return 2461 2462 if self.feature_names is None: 2463 self.feature_names = data.feature_names 2464 self.feature_types = data.feature_types 2465 if data.feature_names is None and self.feature_names is not None: 2466 raise ValueError( 2467 "training data did not have the following fields: " + 2468 ", ".join(self.feature_names) 2469 ) 2470 # Booster can't accept data with different feature names 2471 if self.feature_names != data.feature_names: 2472 dat_missing = set(self.feature_names) - set(data.feature_names) 2473 my_missing = set(data.feature_names) - set(self.feature_names) 2474 2475 msg = 'feature_names mismatch: {0} {1}' 2476 2477 if dat_missing: 2478 msg += ('\nexpected ' + ', '.join( 2479 str(s) for s in dat_missing) + ' in input data') 2480 2481 if my_missing: 2482 msg += ('\ntraining data did not have the following fields: ' + 2483 ', '.join(str(s) for s in my_missing)) 2484 2485 raise ValueError(msg.format(self.feature_names, data.feature_names)) 2486 2487 def get_split_value_histogram( 2488 self, 2489 feature: str, 2490 fmap: Union[os.PathLike, str] = '', 2491 bins: Optional[int] = None, 2492 as_pandas: bool = True 2493 ) -> Union[np.ndarray, DataFrame]: 2494 """Get split value histogram of a feature 2495 2496 Parameters 2497 ---------- 2498 feature: str 2499 The name of the feature. 2500 fmap: str or os.PathLike (optional) 2501 The name of feature map file. 2502 bin: int, default None 2503 The maximum number of bins. 2504 Number of bins equals number of unique split values n_unique, 2505 if bins == None or bins > n_unique. 2506 as_pandas: bool, default True 2507 Return pd.DataFrame when pandas is installed. 2508 If False or pandas is not installed, return numpy ndarray. 2509 2510 Returns 2511 ------- 2512 a histogram of used splitting values for the specified feature 2513 either as numpy array or pandas DataFrame. 2514 """ 2515 xgdump = self.get_dump(fmap=fmap) 2516 values = [] 2517 # pylint: disable=consider-using-f-string 2518 regexp = re.compile(r"\[{0}<([\d.Ee+-]+)\]".format(feature)) 2519 for i, _ in enumerate(xgdump): 2520 m = re.findall(regexp, xgdump[i]) 2521 values.extend([float(x) for x in m]) 2522 2523 n_unique = len(np.unique(values)) 2524 bins = max(min(n_unique, bins) if bins is not None else n_unique, 1) 2525 2526 nph = np.histogram(values, bins=bins) 2527 nph = np.column_stack((nph[1][1:], nph[0])) 2528 nph = nph[nph[:, 1] > 0] 2529 2530 if nph.size == 0: 2531 ft = self.feature_types 2532 fn = self.feature_names 2533 if fn is None: 2534 # Let xgboost generate the feature names. 2535 fn = [f"f{i}" for i in range(self.num_features())] 2536 try: 2537 index = fn.index(feature) 2538 feature_t: Optional[str] = cast(List[str], ft)[index] 2539 except (ValueError, AttributeError, TypeError): 2540 # None.index: attr err, None[0]: type err, fn.index(-1): value err 2541 feature_t = None 2542 if feature_t == "c": # categorical 2543 raise ValueError( 2544 "Split value historgam doesn't support categorical split." 2545 ) 2546 2547 if as_pandas and PANDAS_INSTALLED: 2548 return DataFrame(nph, columns=['SplitValue', 'Count']) 2549 if as_pandas and not PANDAS_INSTALLED: 2550 warnings.warn( 2551 "Returning histogram as ndarray" 2552 " (as_pandas == True, but pandas is not installed).", 2553 UserWarning 2554 ) 2555 return nph 2556