1# coding: utf-8
2# pylint: disable=too-many-arguments, too-many-branches, invalid-name
3# pylint: disable=too-many-lines, too-many-locals, no-self-use
4"""Core XGBoost Library."""
5import collections
6# pylint: disable=no-name-in-module,import-error
7from collections.abc import Mapping
8from typing import List, Optional, Any, Union, Dict, TypeVar
9# pylint: enable=no-name-in-module,import-error
10from typing import Callable, Tuple, cast
11import ctypes
12import os
13import re
14import sys
15import json
16import warnings
17from functools import wraps
18from inspect import signature, Parameter
19
20import numpy as np
21import scipy.sparse
22
23from .compat import (STRING_TYPES, DataFrame, py_str, PANDAS_INSTALLED,
24                     lazy_isinstance)
25from .libpath import find_lib_path
26
# c_bst_ulong corresponds to bst_ulong defined in xgboost/c_api.h.  It is the
# ctypes type used in this module for lengths and counts exchanged with the
# C API (for example the out-parameters of XGDMatrixNumRow / XGDMatrixNumCol).
c_bst_ulong = ctypes.c_uint64
29
30
class XGBoostError(ValueError):
    """Error thrown by xgboost trainer.

    Raised in this module when the native library cannot be loaded and when a
    C API call reports a non-zero return code (see ``_check_call``).  It
    derives from :py:class:`ValueError`, so ``except ValueError`` handlers also
    catch it.
    """
33
34
class EarlyStopException(Exception):
    """Exception to signal early stopping.

    Parameters
    ----------
    best_iteration : int
        The best iteration stopped.
    """

    def __init__(self, best_iteration: int) -> None:
        # No message is forwarded to ``Exception``, so ``str(exc)`` is empty;
        # the payload is carried by the ``best_iteration`` attribute instead.
        super().__init__()
        self.best_iteration = best_iteration
47
48
# Immutable record handed to callbacks.  Field order is part of the interface
# because instances can be built positionally.
CallbackEnv = collections.namedtuple(
    "XGBoostCallbackEnv",
    (
        "model",
        "cvfolds",
        "iteration",
        "begin_iteration",
        "end_iteration",
        "rank",
        "evaluation_result_list",
    ),
)
59
60
def from_pystr_to_cstr(data: Union[str, List[str]]):
    """Convert a Python str or list of Python str to C pointer

    Parameters
    ----------
    data
        str or list of str

    Returns
    -------
    UTF-8 encoded ``bytes`` when ``data`` is a str, otherwise a ctypes array of
    ``c_char_p`` holding one UTF-8 encoded entry per list element.

    Raises
    ------
    TypeError
        If ``data`` is neither a str nor a list.
    """
    if isinstance(data, str):
        return bytes(data, "utf-8")
    if isinstance(data, list):
        encoded = [bytes(item, "utf-8") for item in data]
        return (ctypes.c_char_p * len(encoded))(*encoded)
    # Name the offending type so the caller can tell what was passed in.
    raise TypeError(
        f"Expecting str or list of str, got: {type(data).__name__}"
    )
78
79
def from_cstr_to_pystr(data, length) -> List[str]:
    """Revert C pointer to Python str

    Parameters
    ----------
    data : ctypes pointer
        pointer to data
    length : ctypes pointer
        pointer to length of data

    Returns
    -------
    A list with the first ``length.value`` entries of ``data`` decoded to str.
    """
    # ASCII is a strict subset of UTF-8, so decoding as UTF-8 directly yields
    # the same text as trying ASCII first and falling back on failure.
    return [data[i].decode('utf-8') for i in range(length.value)]
97
98
99def _convert_ntree_limit(
100    booster: "Booster",
101    ntree_limit: Optional[int],
102    iteration_range: Optional[Tuple[int, int]]
103) -> Optional[Tuple[int, int]]:
104    if ntree_limit is not None and ntree_limit != 0:
105        warnings.warn(
106            "ntree_limit is deprecated, use `iteration_range` or model "
107            "slicing instead.",
108            UserWarning
109        )
110        if iteration_range is not None and iteration_range[1] != 0:
111            raise ValueError(
112                "Only one of `iteration_range` and `ntree_limit` can be non zero."
113            )
114        num_parallel_tree, _ = _get_booster_layer_trees(booster)
115        num_parallel_tree = max([num_parallel_tree, 1])
116        iteration_range = (0, ntree_limit // num_parallel_tree)
117    return iteration_range
118
119
120def _expect(expectations, got):
121    """Translate input error into string.
122
123    Parameters
124    ----------
125    expectations: sequence
126        a list of expected value.
127    got:
128        actual input
129
130    Returns
131    -------
132    msg: str
133    """
134    msg = 'Expecting '
135    for t in range(len(expectations) - 1):
136        msg += str(expectations[t])
137        msg += ' or '
138    msg += str(expectations[-1])
139    msg += '.  Got ' + str(got)
140    return msg
141
142
def _log_callback(msg: bytes) -> None:
    """Redirect logs from native library into Python console

    The message arrives as bytes, is converted with ``py_str`` and printed to
    standard output.  ``_get_log_callback_func`` wraps this function in a
    ctypes callback, which ``_load_lib`` registers with the native library.
    """
    print(py_str(msg))
146
147
def _get_log_callback_func():
    """Wrap log_callback() method in ctypes callback type

    The callback type returns nothing and takes a single ``char*`` argument.
    """
    callback_type = ctypes.CFUNCTYPE(None, ctypes.c_char_p)
    return callback_type(_log_callback)
153
154
def _load_lib():
    """Load xgboost Library.

    Every candidate returned by ``find_lib_path`` is tried in order.  While a
    candidate is being loaded its directory is appended to ``PATH`` (needed
    when the library is linked against dependencies that are not available
    system wide); ``PATH`` is put back to its previous state after each
    attempt, including the case where it was not set at all.

    Returns
    -------
    The loaded ctypes library with the log callback registered, or ``None``
    when ``find_lib_path`` returns no candidates.

    Raises
    ------
    XGBoostError
        If no candidate could be loaded, or if registering the log callback
        fails.
    """
    lib_paths = find_lib_path()
    if not lib_paths:
        return None
    # Remember whether PATH existed, so an unset PATH is restored as unset
    # rather than as an empty string.
    original_path = os.environ.get('PATH')
    pathBackup = [] if original_path is None else original_path.split(os.pathsep)
    lib_success = False
    os_error_list = []
    for lib_path in lib_paths:
        try:
            # needed when the lib is linked with non-system-available
            # dependencies
            os.environ['PATH'] = os.pathsep.join(
                pathBackup + [os.path.dirname(lib_path)])
            lib = ctypes.cdll.LoadLibrary(lib_path)
            lib_success = True
        except OSError as e:
            os_error_list.append(str(e))
            continue
        finally:
            if original_path is None:
                os.environ.pop('PATH', None)
            else:
                os.environ['PATH'] = original_path
    if not lib_success:
        libname = os.path.basename(lib_paths[0])
        raise XGBoostError(
            f"""
XGBoost Library ({libname}) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): {os_error_list}
""")
    lib.XGBGetLastError.restype = ctypes.c_char_p
    # The callback object is stored on the library object; presumably this
    # keeps it alive for as long as native code may invoke it.
    lib.callback = _get_log_callback_func()
    if lib.XGBRegisterLogCallback(lib.callback) != 0:
        # Decode the message the same way `_check_call` does instead of
        # raising with a raw bytes object.
        raise XGBoostError(py_str(lib.XGBGetLastError()))
    return lib
200
201
# Load the XGBoost library once, at import time.  `_LIB` is None when
# `_load_lib` finds no candidate library path.
_LIB = _load_lib()
204
205
206def _check_call(ret):
207    """Check the return value of C API call
208
209    This function will raise exception when error occurs.
210    Wrap every API call with this function
211
212    Parameters
213    ----------
214    ret : int
215        return value from API calls
216    """
217    if ret != 0:
218        raise XGBoostError(py_str(_LIB.XGBGetLastError()))
219
220
221def _numpy2ctypes_type(dtype):
222    _NUMPY_TO_CTYPES_MAPPING = {
223        np.float32: ctypes.c_float,
224        np.float64: ctypes.c_double,
225        np.uint32: ctypes.c_uint,
226        np.uint64: ctypes.c_uint64,
227        np.int32: ctypes.c_int32,
228        np.int64: ctypes.c_int64,
229    }
230    if np.intc is not np.int32:  # Windows
231        _NUMPY_TO_CTYPES_MAPPING[np.intc] = _NUMPY_TO_CTYPES_MAPPING[np.int32]
232    if dtype not in _NUMPY_TO_CTYPES_MAPPING.keys():
233        raise TypeError(
234            f"Supported types: {_NUMPY_TO_CTYPES_MAPPING.keys()}, got: {dtype}"
235        )
236    return _NUMPY_TO_CTYPES_MAPPING[dtype]
237
238
239def _cuda_array_interface(data) -> bytes:
240    assert (
241        data.dtype.hasobject is False
242    ), "Input data contains `object` dtype.  Expecting numeric data."
243    interface = data.__cuda_array_interface__
244    if "mask" in interface:
245        interface["mask"] = interface["mask"].__cuda_array_interface__
246    interface_str = bytes(json.dumps(interface), "utf-8")
247    return interface_str
248
249
def ctypes2numpy(cptr, length, dtype):
    """Copy ``length`` items from a ctypes pointer into a new numpy array.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a pointer to the ctypes type matching ``dtype``, or
        if ``memmove`` returns a null address.
    """
    expected = _numpy2ctypes_type(dtype)
    if not isinstance(cptr, ctypes.POINTER(expected)):
        raise RuntimeError(f"expected {expected} pointer")
    out = np.zeros(length, dtype=dtype)
    n_bytes = length * out.strides[0]
    if not ctypes.memmove(out.ctypes.data, cptr, n_bytes):
        raise RuntimeError("memmove failed")
    return out
259
260
def ctypes2cupy(cptr, length, dtype):
    """Copy ``length`` items from a device pointer into a new cupy array.

    Raises
    ------
    RuntimeError
        If ``dtype`` is not ``cupy.float32`` or ``cupy.uint32``.
    """
    # pylint: disable=import-error
    import cupy
    from cupy.cuda.memory import MemoryPointer, UnownedMemory

    supported = {cupy.float32: ctypes.c_float, cupy.uint32: ctypes.c_uint}
    if dtype not in supported:
        raise RuntimeError(f"Supported types: {supported.keys()}")
    address = ctypes.cast(cptr, ctypes.c_void_p).value
    # pylint: disable=c-extension-no-member,no-member
    device = cupy.cuda.runtime.pointerGetAttributes(address).device
    n_bytes = length * ctypes.sizeof(supported[dtype])
    # The owner field is just used to keep the memory alive with ref count.  As
    # the unowned wrapper's life time is scoped within this function we don't
    # need that.
    unowned = UnownedMemory(address, n_bytes, owner=None)
    pointer = MemoryPointer(unowned, 0)
    # pylint: disable=unexpected-keyword-arg
    view = cupy.ndarray((length,), dtype=dtype, memptr=pointer)
    assert view.device.id == device
    # Return an owning copy rather than the view over unowned memory.
    return cupy.array(view, copy=True)
285
286
def ctypes2buffer(cptr, length):
    """Copy ``length`` bytes from a ctypes char pointer into a new bytearray.

    Raises
    ------
    RuntimeError
        If ``cptr`` is not a ``POINTER(c_char)`` or ``memmove`` returns a null
        address.
    """
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_char)):
        raise RuntimeError('expected char pointer')
    out = bytearray(length)
    # ctypes view sharing memory with `out`, used as the memmove destination.
    target = (ctypes.c_char * length).from_buffer(out)
    moved = ctypes.memmove(target, cptr, length)
    if not moved:
        raise RuntimeError('memmove failed')
    return out
296
297
def c_str(string):
    """Return a ``ctypes.c_char_p`` holding ``string`` encoded as UTF-8."""
    encoded = string.encode('utf-8')
    return ctypes.c_char_p(encoded)
301
302
def c_array(ctype, values):
    """Convert a sequence of values to a ctypes array of ``ctype``.

    Parameters
    ----------
    ctype
        ctypes scalar type of the array elements, e.g. ``ctypes.c_int``.
    values
        Sequence or numpy array with the values to copy.

    Returns
    -------
    A new ``ctype * len(values)`` array.  A C-contiguous numpy array whose item
    size equals ``sizeof(ctype)`` is copied byte-wise; anything else is copied
    element by element.
    """
    # NOTE: the byte-wise path only compares item sizes, so it reinterprets the
    # raw bytes; callers are expected to pass a dtype matching `ctype`.
    if (
        isinstance(values, np.ndarray)
        and values.dtype.itemsize == ctypes.sizeof(ctype)
        # from_buffer_copy needs a contiguous buffer; strided views take the
        # element-wise path below instead of raising.
        and values.flags.c_contiguous
    ):
        return (ctype * len(values)).from_buffer_copy(values)
    return (ctype * len(values))(*values)
308
309
def _prediction_output(shape, dims, predts, is_cuda):
    """Build the prediction array from the raw C API output pointers.

    ``shape`` holds ``dims.value`` unsigned 64-bit extents and ``predts``
    holds the float32 values.  The values are copied into a cupy array when
    ``is_cuda`` is true, otherwise into a numpy array, and reshaped.
    """
    out_shape: np.ndarray = ctypes2numpy(shape, dims.value, np.uint64)
    n_values = int(np.prod(out_shape))
    to_array = ctypes2cupy if is_cuda else ctypes2numpy
    flat = to_array(predts, n_values, np.float32)
    return flat.reshape(out_shape)
319
320
class DataIter:  # pylint: disable=too-many-instance-attributes
    """The interface for user defined data iterator.

    Subclasses override :py:meth:`reset` and :py:meth:`next`.  Both are invoked
    through ctypes callbacks; an exception raised by either is captured and
    raised again from Python code by ``_reraise``.

    Parameters
    ----------
    cache_prefix:
        Prefix to the cache files, only used in external memory.  It can be either an URI
        or a file path.

    """
    # Type of the default value passed to (and returned by) `_handle_exception`.
    _T = TypeVar("_T")

    def __init__(self, cache_prefix: Optional[str] = None) -> None:
        self.cache_prefix = cache_prefix

        # Proxy DMatrix that receives each batch in `_next_wrapper`.
        self._handle = _ProxyDMatrix()
        # Exception captured inside a callback, held until `_reraise` runs.
        self._exception: Optional[Exception] = None
        self._enable_categorical = False
        self._allow_host = True
        # Stage data in Python until reset or next is called, so that the data
        # is not freed while it is still in use.
        self._temporary_data = None

    def _get_callbacks(
        self, allow_host: bool, enable_categorical: bool
    ) -> Tuple[Callable, Callable]:
        """Create the ctypes ``(reset, next)`` callbacks and record the options.

        The callbacks are also stored on ``self`` (presumably so they stay
        alive while native code holds the function pointers).
        """
        # `cache_prefix` is only set in `__init__`; its absence means a
        # subclass did not call `super().__init__()`.
        assert hasattr(self, "cache_prefix"), "__init__ is not called."
        self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)(
            self._reset_wrapper
        )
        self._next_callback = ctypes.CFUNCTYPE(
            ctypes.c_int,
            ctypes.c_void_p,
        )(self._next_wrapper)
        self._allow_host = allow_host
        self._enable_categorical = enable_categorical
        return self._reset_callback, self._next_callback

    @property
    def proxy(self) -> "_ProxyDMatrix":
        """Handle of DMatrix proxy."""
        return self._handle

    def _handle_exception(self, fn: Callable, dft_ret: _T) -> _T:
        """Call ``fn`` and return its result, capturing any exception.

        When an exception is already pending, ``fn`` is not called.  In that
        case, and when ``fn`` raises, ``dft_ret`` is returned.
        """
        if self._exception is not None:
            return dft_ret

        try:
            return fn()
        except Exception as e:  # pylint: disable=broad-except
            # Defer the exception in order to return 0 and stop the iteration.
            # Exception inside a ctype callback function has no effect except
            # for printing to stderr (doesn't stop the execution).
            tb = sys.exc_info()[2]
            # On dask, the worker is restarted and somehow the information is
            # lost.
            self._exception = e.with_traceback(tb)
        return dft_ret

    def _reraise(self) -> None:
        """Drop the staged data and raise the pending exception, if any."""
        self._temporary_data = None
        if self._exception is not None:
            #  pylint 2.7.0 believes `self._exception` can be None even with `assert
            #  isinstance`
            exc = self._exception
            # Clear the pending exception before raising it.
            self._exception = None
            raise exc  # pylint: disable=raising-bad-type

    def __del__(self) -> None:
        # Sanity checks: no staged data and no pending exception should remain
        # when the iterator is destroyed.
        assert self._temporary_data is None
        assert self._exception is None

    def _reset_wrapper(self, this: None) -> None:  # pylint: disable=unused-argument
        """A wrapper for user defined `reset` function."""
        # free the data
        self._temporary_data = None
        self._handle_exception(self.reset, None)

    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
        """A wrapper for user defined `next` function.

        `this` is not used in Python.  ctypes can handle `self` of a Python
        member function automatically when converting it to c function
        pointer.

        """
        @_deprecate_positional_args
        def data_handle(
            data: Any,
            *,
            feature_names: Optional[List[str]] = None,
            feature_types: Optional[List[str]] = None,
            **kwargs: Any,
        ) -> None:
            # Function handed to the user's `next`; it writes one batch and
            # its meta info into the proxy DMatrix.
            from .data import dispatch_proxy_set_data
            from .data import _proxy_transform

            new, cat_codes, feature_names, feature_types = _proxy_transform(
                data,
                feature_names,
                feature_types,
                self._enable_categorical,
            )
            # Stage the data, meta info are copied inside C++ MetaInfo.
            self._temporary_data = (new, cat_codes)
            dispatch_proxy_set_data(self.proxy, new, cat_codes, self._allow_host)
            self.proxy.set_info(
                feature_names=feature_names,
                feature_types=feature_types,
                **kwargs,
            )
        # 0 is returned when `next` raises or an exception is already pending.
        # pylint: disable=not-callable
        return self._handle_exception(lambda: self.next(data_handle), 0)

    def reset(self) -> None:
        """Reset the data iterator.  Prototype for user defined function."""
        raise NotImplementedError()

    def next(self, input_data: Callable) -> int:
        """Set the next batch of data.

        Parameters
        ----------

        input_data:
            A function with same data fields like `data`, `label` with
            `xgboost.DMatrix`.

        Returns
        -------
        0 if there's no more batch, otherwise 1.

        """
        raise NotImplementedError()
454
455
456# Notice for `_deprecate_positional_args`
457# Authors: Olivier Grisel
458#          Gael Varoquaux
459#          Andreas Mueller
460#          Lars Buitinck
461#          Alexandre Gramfort
462#          Nicolas Tresegnie
463#          Sylvain Marie
464# License: BSD 3 clause
465def _deprecate_positional_args(f):
466    """Decorator for methods that issues warnings for positional arguments
467
468    Using the keyword-only argument syntax in pep 3102, arguments after the
469    * will issue a warning when passed as a positional argument.
470
471    Modifed from sklearn utils.validation.
472
473    Parameters
474    ----------
475    f : function
476        function to check arguments on
477    """
478    sig = signature(f)
479    kwonly_args = []
480    all_args = []
481
482    for name, param in sig.parameters.items():
483        if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
484            all_args.append(name)
485        elif param.kind == Parameter.KEYWORD_ONLY:
486            kwonly_args.append(name)
487
488    @wraps(f)
489    def inner_f(*args, **kwargs):
490        extra_args = len(args) - len(all_args)
491        if extra_args > 0:
492            # ignore first 'self' argument for instance methods
493            args_msg = [
494                f"{name}" for name, _ in zip(
495                    kwonly_args[:extra_args], args[-extra_args:]
496                )
497            ]
498            # pylint: disable=consider-using-f-string
499            warnings.warn(
500                "Pass `{}` as keyword args.  Passing these as positional "
501                "arguments will be considered as error in future releases.".
502                format(", ".join(args_msg)), FutureWarning
503            )
504        for k, arg in zip(sig.parameters, args):
505            kwargs[k] = arg
506        return f(**kwargs)
507
508    return inner_f
509
510
511class DMatrix:  # pylint: disable=too-many-instance-attributes
512    """Data Matrix used in XGBoost.
513
514    DMatrix is an internal data structure that is used by XGBoost,
515    which is optimized for both memory efficiency and training speed.
516    You can construct DMatrix from multiple different sources of data.
517    """
518
519    @_deprecate_positional_args
520    def __init__(
521        self,
522        data,
523        label=None,
524        *,
525        weight=None,
526        base_margin=None,
527        missing: Optional[float] = None,
528        silent=False,
529        feature_names: Optional[List[str]] = None,
530        feature_types: Optional[List[str]] = None,
531        nthread: Optional[int] = None,
532        group=None,
533        qid=None,
534        label_lower_bound=None,
535        label_upper_bound=None,
536        feature_weights=None,
537        enable_categorical: bool = False,
538    ) -> None:
539        """Parameters
540        ----------
541        data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
542               dt.Frame/cudf.DataFrame/cupy.array/dlpack
543            Data source of DMatrix.
544            When data is string or os.PathLike type, it represents the path
545            libsvm format txt file, csv file (by specifying uri parameter
546            'path_to_csv?format=csv'), or binary file that xgboost can read
547            from.
548        label : array_like
549            Label of the training data.
550        weight : array_like
551            Weight for each instance.
552
553            .. note:: For ranking task, weights are per-group.
554
555                In ranking task, one weight is assigned to each group (not each
556                data point). This is because we only care about the relative
557                ordering of data points within each group, so it doesn't make
558                sense to assign weights to individual data points.
559
560        base_margin: array_like
561            Base margin used for boosting from existing model.
562        missing : float, optional
563            Value in the input data which needs to be present as a missing
564            value. If None, defaults to np.nan.
565        silent : boolean, optional
566            Whether print messages during construction
567        feature_names : list, optional
568            Set names for features.
569        feature_types :
570
571            Set types for features.  When `enable_categorical` is set to `True`, string
572            "c" represents categorical data type.
573
574        nthread : integer, optional
575            Number of threads to use for loading data when parallelization is
576            applicable. If -1, uses maximum threads available on the system.
577        group : array_like
578            Group size for all ranking group.
579        qid : array_like
580            Query ID for data samples, used for ranking.
581        label_lower_bound : array_like
582            Lower bound for survival training.
583        label_upper_bound : array_like
584            Upper bound for survival training.
585        feature_weights : array_like, optional
586            Set feature weights for column sampling.
587        enable_categorical: boolean, optional
588
589            .. versionadded:: 1.3.0
590
591            Experimental support of specializing for categorical features.  Do not set to
592            True unless you are interested in development.  Currently it's only available
593            for `gpu_hist` tree method with 1 vs rest (one hot) categorical split.  Also,
594            JSON serialization format is required.
595
596        """
597        if group is not None and qid is not None:
598            raise ValueError("Either one of `group` or `qid` should be None.")
599
600        self.missing = missing if missing is not None else np.nan
601        self.nthread = nthread if nthread is not None else -1
602        self.silent = silent
603
604        # force into void_p, mac need to pass things in as void_p
605        if data is None:
606            self.handle = None
607            return
608
609        from .data import dispatch_data_backend, _is_iter
610
611        if _is_iter(data):
612            self._init_from_iter(data, enable_categorical)
613            assert self.handle is not None
614            return
615
616        handle, feature_names, feature_types = dispatch_data_backend(
617            data,
618            missing=self.missing,
619            threads=self.nthread,
620            feature_names=feature_names,
621            feature_types=feature_types,
622            enable_categorical=enable_categorical,
623        )
624        assert handle is not None
625        self.handle = handle
626
627        self.set_info(
628            label=label,
629            weight=weight,
630            base_margin=base_margin,
631            group=group,
632            qid=qid,
633            label_lower_bound=label_lower_bound,
634            label_upper_bound=label_upper_bound,
635            feature_weights=feature_weights,
636        )
637
638        if feature_names is not None:
639            self.feature_names = feature_names
640        if feature_types is not None:
641            self.feature_types = feature_types
642
643    def _init_from_iter(self, iterator: DataIter, enable_categorical: bool):
644        it = iterator
645        args = {
646            "missing": self.missing,
647            "nthread": self.nthread,
648            "cache_prefix": it.cache_prefix if it.cache_prefix else "",
649        }
650        args = from_pystr_to_cstr(json.dumps(args))
651        handle = ctypes.c_void_p()
652        # pylint: disable=protected-access
653        reset_callback, next_callback = it._get_callbacks(
654            True, enable_categorical
655        )
656        ret = _LIB.XGDMatrixCreateFromCallback(
657            None,
658            it.proxy.handle,
659            reset_callback,
660            next_callback,
661            args,
662            ctypes.byref(handle),
663        )
664        # pylint: disable=protected-access
665        it._reraise()
666        # delay check_call to throw intermediate exception first
667        _check_call(ret)
668        self.handle = handle
669
670    def __del__(self):
671        if hasattr(self, "handle") and self.handle:
672            _check_call(_LIB.XGDMatrixFree(self.handle))
673            self.handle = None
674
675    @_deprecate_positional_args
676    def set_info(
677        self,
678        *,
679        label=None,
680        weight=None,
681        base_margin=None,
682        group=None,
683        qid=None,
684        label_lower_bound=None,
685        label_upper_bound=None,
686        feature_names: Optional[List[str]] = None,
687        feature_types: Optional[List[str]] = None,
688        feature_weights=None
689    ) -> None:
690        """Set meta info for DMatrix.  See doc string for :py:obj:`xgboost.DMatrix`."""
691        from .data import dispatch_meta_backend
692
693        if label is not None:
694            self.set_label(label)
695        if weight is not None:
696            self.set_weight(weight)
697        if base_margin is not None:
698            self.set_base_margin(base_margin)
699        if group is not None:
700            self.set_group(group)
701        if qid is not None:
702            self.set_uint_info('qid', qid)
703        if label_lower_bound is not None:
704            self.set_float_info('label_lower_bound', label_lower_bound)
705        if label_upper_bound is not None:
706            self.set_float_info('label_upper_bound', label_upper_bound)
707        if feature_names is not None:
708            self.feature_names = feature_names
709        if feature_types is not None:
710            self.feature_types = feature_types
711        if feature_weights is not None:
712            dispatch_meta_backend(matrix=self, data=feature_weights,
713                                  name='feature_weights')
714
715    def get_float_info(self, field):
716        """Get float property from the DMatrix.
717
718        Parameters
719        ----------
720        field: str
721            The field name of the information
722
723        Returns
724        -------
725        info : array
726            a numpy array of float information of the data
727        """
728        length = c_bst_ulong()
729        ret = ctypes.POINTER(ctypes.c_float)()
730        _check_call(_LIB.XGDMatrixGetFloatInfo(self.handle,
731                                               c_str(field),
732                                               ctypes.byref(length),
733                                               ctypes.byref(ret)))
734        return ctypes2numpy(ret, length.value, np.float32)
735
736    def get_uint_info(self, field):
737        """Get unsigned integer property from the DMatrix.
738
739        Parameters
740        ----------
741        field: str
742            The field name of the information
743
744        Returns
745        -------
746        info : array
747            a numpy array of unsigned integer information of the data
748        """
749        length = c_bst_ulong()
750        ret = ctypes.POINTER(ctypes.c_uint)()
751        _check_call(_LIB.XGDMatrixGetUIntInfo(self.handle,
752                                              c_str(field),
753                                              ctypes.byref(length),
754                                              ctypes.byref(ret)))
755        return ctypes2numpy(ret, length.value, np.uint32)
756
    def set_float_info(self, field: str, data) -> None:
        """Set float type property into the DMatrix.

        Parameters
        ----------
        field: str
            The field name of the information

        data: numpy array
            The array of data to be set
        """
        # Imported inside the method (presumably to avoid a circular import
        # with `.data` -- TODO confirm).
        from .data import dispatch_meta_backend
        dispatch_meta_backend(self, data, field, 'float')
770
    def set_float_info_npy2d(self, field: str, data) -> None:
        """Set float type property into the DMatrix
           for numpy 2d array input

        This performs exactly the same call as :py:meth:`set_float_info`.

        Parameters
        ----------
        field: str
            The field name of the information

        data: numpy array
            The array of data to be set
        """
        from .data import dispatch_meta_backend
        dispatch_meta_backend(self, data, field, 'float')
785
    def set_uint_info(self, field: str, data) -> None:
        """Set uint type property into the DMatrix.

        Parameters
        ----------
        field: str
            The field name of the information

        data: numpy array
            The array of data to be set
        """
        from .data import dispatch_meta_backend
        # 'uint32' is the type tag passed to the backend for this field.
        dispatch_meta_backend(self, data, field, 'uint32')
799
800    def save_binary(self, fname, silent=True):
801        """Save DMatrix to an XGBoost buffer.  Saved binary can be later loaded
802        by providing the path to :py:func:`xgboost.DMatrix` as input.
803
804        Parameters
805        ----------
806        fname : string or os.PathLike
807            Name of the output buffer file.
808        silent : bool (optional; default: True)
809            If set, the output is suppressed.
810        """
811        fname = os.fspath(os.path.expanduser(fname))
812        _check_call(_LIB.XGDMatrixSaveBinary(self.handle,
813                                             c_str(fname),
814                                             ctypes.c_int(silent)))
815
    def set_label(self, label) -> None:
        """Set label of dmatrix

        Parameters
        ----------
        label: array like
            The label information to be set into DMatrix
        """
        from .data import dispatch_meta_backend
        # Stored under the 'label' field with the 'float' type tag.
        dispatch_meta_backend(self, label, 'label', 'float')
826
    def set_weight(self, weight) -> None:
        """Set weight of each instance.

        Parameters
        ----------
        weight : array like
            Weight for each data point

            .. note:: For ranking task, weights are per-group.

                In ranking task, one weight is assigned to each group (not each
                data point). This is because we only care about the relative
                ordering of data points within each group, so it doesn't make
                sense to assign weights to individual data points.

        """
        from .data import dispatch_meta_backend
        # Stored under the 'weight' field with the 'float' type tag.
        dispatch_meta_backend(self, weight, 'weight', 'float')
845
    def set_base_margin(self, margin) -> None:
        """Set base margin of booster to start from.

        This can be used to specify the prediction of an existing model as the
        base margin.  Note that the margin is needed, not the transformed
        prediction; e.g. for logistic regression, pass the value before the
        logistic transformation.  See also example/demo.py.

        Parameters
        ----------
        margin: array like
            Prediction margin of each datapoint

        """
        from .data import dispatch_meta_backend
        # Stored under the 'base_margin' field with the 'float' type tag.
        dispatch_meta_backend(self, margin, 'base_margin', 'float')
862
    def set_group(self, group) -> None:
        """Set group size of DMatrix (used for ranking).

        Parameters
        ----------
        group : array like
            Group size of each group
        """
        from .data import dispatch_meta_backend
        # Stored under the 'group' field with the 'uint32' type tag.
        dispatch_meta_backend(self, group, 'group', 'uint32')
873
    def get_label(self) -> np.ndarray:
        """Get the label of the DMatrix.

        Returns
        -------
        label : array
            The 'label' float field, as returned by :py:meth:`get_float_info`.
        """
        return self.get_float_info('label')
882
    def get_weight(self) -> np.ndarray:
        """Get the weight of the DMatrix.

        Returns
        -------
        weight : array
            The 'weight' float field, as returned by :py:meth:`get_float_info`.
        """
        return self.get_float_info('weight')
891
    def get_base_margin(self) -> np.ndarray:
        """Get the base margin of the DMatrix.

        Returns
        -------
        base_margin : array
            The 'base_margin' float field, as returned by
            :py:meth:`get_float_info`.
        """
        return self.get_float_info('base_margin')
900
901    def num_row(self):
902        """Get the number of rows in the DMatrix.
903
904        Returns
905        -------
906        number of rows : int
907        """
908        ret = c_bst_ulong()
909        _check_call(_LIB.XGDMatrixNumRow(self.handle,
910                                         ctypes.byref(ret)))
911        return ret.value
912
913    def num_col(self):
914        """Get the number of columns (features) in the DMatrix.
915
916        Returns
917        -------
918        number of columns : int
919        """
920        ret = c_bst_ulong()
921        _check_call(_LIB.XGDMatrixNumCol(self.handle, ctypes.byref(ret)))
922        return ret.value
923
924    def slice(
925        self, rindex: Union[List[int], np.ndarray], allow_groups: bool = False
926    ) -> "DMatrix":
927        """Slice the DMatrix and return a new DMatrix that only contains `rindex`.
928
929        Parameters
930        ----------
931        rindex
932            List of indices to be selected.
933        allow_groups
934            Allow slicing of a matrix with a groups attribute
935
936        Returns
937        -------
938        res
939            A new DMatrix containing only selected indices.
940        """
941        from .data import _maybe_np_slice
942
943        res = DMatrix(None)
944        res.handle = ctypes.c_void_p()
945        rindex = _maybe_np_slice(rindex, dtype=np.int32)
946        _check_call(
947            _LIB.XGDMatrixSliceDMatrixEx(
948                self.handle,
949                c_array(ctypes.c_int, rindex),
950                c_bst_ulong(len(rindex)),
951                ctypes.byref(res.handle),
952                ctypes.c_int(1 if allow_groups else 0),
953            )
954        )
955        return res
956
957    @property
958    def feature_names(self) -> Optional[List[str]]:
959        """Get feature names (column labels).
960
961        Returns
962        -------
963        feature_names : list or None
964        """
965        length = c_bst_ulong()
966        sarr = ctypes.POINTER(ctypes.c_char_p)()
967        _check_call(
968            _LIB.XGDMatrixGetStrFeatureInfo(
969                self.handle,
970                c_str("feature_name"),
971                ctypes.byref(length),
972                ctypes.byref(sarr),
973            )
974        )
975        feature_names = from_cstr_to_pystr(sarr, length)
976        if not feature_names:
977            return None
978        return feature_names
979
980    @feature_names.setter
981    def feature_names(self, feature_names: Optional[Union[List[str], str]]) -> None:
982        """Set feature names (column labels).
983
984        Parameters
985        ----------
986        feature_names : list or None
987            Labels for features. None will reset existing feature names
988        """
989        if feature_names is not None:
990            # validate feature name
991            try:
992                if not isinstance(feature_names, str):
993                    feature_names = list(feature_names)
994                else:
995                    feature_names = [feature_names]
996            except TypeError:
997                feature_names = [feature_names]
998
999            if len(feature_names) != len(set(feature_names)):
1000                raise ValueError('feature_names must be unique')
1001            if len(feature_names) != self.num_col() and self.num_col() != 0:
1002                msg = ("feature_names must have the same length as data, ",
1003                       f"expected {self.num_col()}, got {len(feature_names)}")
1004                raise ValueError(msg)
1005            # prohibit to use symbols may affect to parse. e.g. []<
1006            if not all(isinstance(f, str) and
1007                       not any(x in f for x in set(('[', ']', '<')))
1008                       for f in feature_names):
1009                raise ValueError('feature_names must be string, and may not contain [, ] or <')
1010            c_feature_names = [bytes(f, encoding='utf-8') for f in feature_names]
1011            c_feature_names = (ctypes.c_char_p *
1012                               len(c_feature_names))(*c_feature_names)
1013            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
1014                self.handle, c_str('feature_name'),
1015                c_feature_names,
1016                c_bst_ulong(len(feature_names))))
1017        else:
1018            # reset feature_types also
1019            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
1020                self.handle,
1021                c_str('feature_name'),
1022                None,
1023                c_bst_ulong(0)))
1024            self.feature_types = None
1025
1026    @property
1027    def feature_types(self) -> Optional[List[str]]:
1028        """Get feature types (column types).
1029
1030        Returns
1031        -------
1032        feature_types : list or None
1033        """
1034        length = c_bst_ulong()
1035        sarr = ctypes.POINTER(ctypes.c_char_p)()
1036        _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
1037                                                    c_str('feature_type'),
1038                                                    ctypes.byref(length),
1039                                                    ctypes.byref(sarr)))
1040        res = from_cstr_to_pystr(sarr, length)
1041        if not res:
1042            return None
1043        return res
1044
1045    @feature_types.setter
1046    def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
1047        """Set feature types (column types).
1048
1049        This is for displaying the results and categorical data support.  See doc string
1050        of :py:obj:`xgboost.DMatrix` for details.
1051
1052        Parameters
1053        ----------
1054        feature_types : list or None
1055            Labels for features. None will reset existing feature names
1056
1057        """
1058        # For compatibility reason this function wraps single str input into a list.  But
1059        # we should not promote such usage since other than visualization, the field is
1060        # also used for specifying categorical data type.
1061        if feature_types is not None:
1062            if not isinstance(feature_types, (list, str)):
1063                raise TypeError(
1064                    'feature_types must be string or list of strings')
1065            if isinstance(feature_types, str):
1066                # single string will be applied to all columns
1067                feature_types = [feature_types] * self.num_col()
1068            try:
1069                if not isinstance(feature_types, str):
1070                    feature_types = list(feature_types)
1071                else:
1072                    feature_types = [feature_types]
1073            except TypeError:
1074                feature_types = [feature_types]
1075            c_feature_types = [bytes(f, encoding='utf-8')
1076                               for f in feature_types]
1077            c_feature_types = (ctypes.c_char_p *
1078                               len(c_feature_types))(*c_feature_types)
1079            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
1080                self.handle, c_str('feature_type'),
1081                c_feature_types,
1082                c_bst_ulong(len(feature_types))))
1083
1084            if len(feature_types) != self.num_col():
1085                msg = 'feature_types must have the same length as data'
1086                raise ValueError(msg)
1087        else:
1088            # Reset.
1089            _check_call(_LIB.XGDMatrixSetStrFeatureInfo(
1090                self.handle,
1091                c_str('feature_type'),
1092                None,
1093                c_bst_ulong(0)))
1094
1095
class _ProxyDMatrix(DMatrix):
    """A placeholder class when DMatrix cannot be constructed (DeviceQuantileDMatrix,
    inplace_predict).

    Each ``_set_data_from_*`` method forwards its input to the native proxy handle
    created in ``__init__`` and passes the native return code to ``_check_call``.

    """

    def __init__(self):  # pylint: disable=super-init-not-called
        self.handle = ctypes.c_void_p()
        _check_call(_LIB.XGProxyDMatrixCreate(ctypes.byref(self.handle)))

    def _set_data_from_cuda_interface(self, data) -> None:
        """Set data from CUDA array interface."""
        interface = data.__cuda_array_interface__
        interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
        _check_call(
            _LIB.XGProxyDMatrixSetDataCudaArrayInterface(self.handle, interface_str)
        )

    def _set_data_from_cuda_columnar(self, data, cat_codes: list) -> None:
        """Set data from CUDA columnar format."""
        from .data import _cudf_array_interfaces

        interfaces_str = _cudf_array_interfaces(data, cat_codes)
        _check_call(_LIB.XGProxyDMatrixSetDataCudaColumnar(self.handle, interfaces_str))

    def _set_data_from_array(self, data: np.ndarray):
        """Set data from numpy array."""
        from .data import _array_interface

        _check_call(
            _LIB.XGProxyDMatrixSetDataDense(self.handle, _array_interface(data))
        )

    def _set_data_from_csr(self, csr):
        """Set data from scipy csr.

        Passes ``csr.indptr``, ``csr.indices`` and ``csr.data`` together with the
        number of columns (``csr.shape[1]``) to the native proxy.
        """
        from .data import _array_interface

        # Check the return code like every other setter so that a native failure
        # is not silently dropped.
        _check_call(
            _LIB.XGProxyDMatrixSetDataCSR(
                self.handle,
                _array_interface(csr.indptr),
                _array_interface(csr.indices),
                _array_interface(csr.data),
                ctypes.c_size_t(csr.shape[1]),
            )
        )
1140
1141
class DeviceQuantileDMatrix(DMatrix):
    """Device memory Data Matrix used in XGBoost for training with
    tree_method='gpu_hist'.

    Do not use this for test/validation tasks as some information may be lost in
    quantisation.  This DMatrix is primarily designed to save memory in training
    from device memory inputs by avoiding intermediate storage.  Set max_bin to
    control the number of bins during quantisation.  See doc string in
    :py:obj:`xgboost.DMatrix` for documents on meta info.

    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.

    .. versionadded:: 1.1.0

    """

    @_deprecate_positional_args
    def __init__(  # pylint: disable=super-init-not-called
        self,
        data,
        label=None,
        *,
        weight=None,
        base_margin=None,
        missing=None,
        silent=False,
        feature_names=None,
        feature_types=None,
        nthread: Optional[int] = None,
        max_bin: int = 256,
        group=None,
        qid=None,
        label_lower_bound=None,
        label_upper_bound=None,
        feature_weights=None,
        enable_categorical: bool = False,
    ):
        self.max_bin = max_bin
        # Fall back to NaN / a single thread when the caller gives no value.
        self.missing = np.nan if missing is None else missing
        self.nthread = 1 if nthread is None else nthread
        self._silent = silent  # unused, kept for compatibility

        # An existing native handle is adopted as-is; nothing else is built.
        if isinstance(data, ctypes.c_void_p):
            self.handle = data
            return

        if not (qid is None or group is None):
            raise ValueError(
                'Only one of the eval_qid or eval_group for each evaluation '
                'dataset should be provided.'
            )

        meta = dict(
            label=label,
            weight=weight,
            base_margin=base_margin,
            group=group,
            qid=qid,
            label_lower_bound=label_lower_bound,
            label_upper_bound=label_upper_bound,
            feature_weights=feature_weights,
            feature_names=feature_names,
            feature_types=feature_types,
        )
        self._init(data, enable_categorical=enable_categorical, **meta)

    def _init(self, data, enable_categorical, **meta):
        """Create the native matrix by feeding `data` through an iterator.

        `data` is used directly when it already is an iterator; otherwise it is
        wrapped, together with `meta`, in a ``SingleBatchInternalIter``.
        """
        from .data import (
            _is_dlpack,
            _transform_dlpack,
            _is_iter,
            SingleBatchInternalIter,
        )

        if _is_dlpack(data):
            # We specialize for dlpack because cupy will take the memory from it so
            # it can't be transformed twice.
            data = _transform_dlpack(data)
        it = data if _is_iter(data) else SingleBatchInternalIter(data=data, **meta)

        # pylint: disable=protected-access
        reset_callback, next_callback = it._get_callbacks(False, enable_categorical)
        if it.cache_prefix is not None:
            raise ValueError(
                "DeviceQuantileDMatrix doesn't cache data, remove the cache_prefix "
                "in iterator to fix this error."
            )

        new_handle = ctypes.c_void_p()
        status = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
            None,
            it.proxy.handle,
            reset_callback,
            next_callback,
            ctypes.c_float(self.missing),
            ctypes.c_int(self.nthread),
            ctypes.c_int(self.max_bin),
            ctypes.byref(new_handle),
        )
        # pylint: disable=protected-access
        it._reraise()
        # delay check_call to throw intermediate exception first
        _check_call(status)
        self.handle = new_handle
1247
1248
# Callable type taking an ndarray and a DMatrix and returning a pair of ndarrays.
# Presumably the type of the `fobj` argument, whose result `Booster.update`
# unpacks as ``grad, hess`` -- confirm against the callers.
Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
# Callable type taking an ndarray and a DMatrix and returning a ``(str, float)``
# pair -- presumably a metric name and its value; confirm against the callers.
Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]
1251
1252
def _get_booster_layer_trees(model: "Booster") -> Tuple[int, int]:
    """Get number of trees added to booster per-iteration.  This function will be removed
    once `best_ntree_limit` is dropped in favor of `best_iteration`.  Returns
    `num_parallel_tree` and `num_groups`.

    Both values are read from the JSON configuration returned by
    ``model.save_config()``; an unrecognised booster name raises ``ValueError``.

    """
    learner = json.loads(model.save_config())["learner"]
    gradient_booster = learner["gradient_booster"]
    name = gradient_booster["name"]

    if name == "gblinear":
        num_parallel_tree = 0
    elif name in ("dart", "gbtree"):
        # dart keeps the tree parameters one level deeper, under "gbtree".
        tree_section = gradient_booster["gbtree"] if name == "dart" else gradient_booster
        num_parallel_tree = int(
            tree_section["gbtree_train_param"]["num_parallel_tree"]
        )
    else:
        raise ValueError(f"Unknown booster: {name}")

    num_groups = int(learner["learner_model_param"]["num_class"])
    return num_parallel_tree, num_groups
1279
1280
1281class Booster(object):
1282    # pylint: disable=too-many-public-methods
1283    """A Booster of XGBoost.
1284
1285    Booster is the model of xgboost, that contains low level routines for
1286    training, prediction and evaluation.
1287    """
1288
1289    def __init__(self, params=None, cache=(), model_file=None):
1290        # pylint: disable=invalid-name
1291        """
1292        Parameters
1293        ----------
1294        params : dict
1295            Parameters for boosters.
1296        cache : list
1297            List of cache items.
1298        model_file : string/os.PathLike/Booster/bytearray
1299            Path to the model file if it's string or PathLike.
1300        """
1301        for d in cache:
1302            if not isinstance(d, DMatrix):
1303                raise TypeError(f'invalid cache item: {type(d).__name__}', cache)
1304
1305        dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
1306        self.handle = ctypes.c_void_p()
1307        _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(len(cache)),
1308                                         ctypes.byref(self.handle)))
1309        for d in cache:
1310            # Validate feature only after the feature names are saved into booster.
1311            self._validate_features(d)
1312
1313        if isinstance(model_file, Booster):
1314            assert self.handle is not None
1315            # We use the pickle interface for getting memory snapshot from
1316            # another model, and load the snapshot with this booster.
1317            state = model_file.__getstate__()
1318            handle = state['handle']
1319            del state['handle']
1320            ptr = (ctypes.c_char * len(handle)).from_buffer(handle)
1321            length = c_bst_ulong(len(handle))
1322            _check_call(
1323                _LIB.XGBoosterUnserializeFromBuffer(self.handle, ptr, length))
1324            self.__dict__.update(state)
1325        elif isinstance(model_file, (STRING_TYPES, os.PathLike, bytearray)):
1326            self.load_model(model_file)
1327        elif model_file is None:
1328            pass
1329        else:
1330            raise TypeError('Unknown type:', model_file)
1331
1332        params = params or {}
1333        params = self._configure_metrics(params.copy())
1334        params = self._configure_constraints(params)
1335        if isinstance(params, list):
1336            params.append(('validate_parameters', True))
1337        else:
1338            params['validate_parameters'] = True
1339
1340        self.set_param(params or {})
1341        if (params is not None) and ('booster' in params):
1342            self.booster = params['booster']
1343        else:
1344            self.booster = 'gbtree'
1345
1346    def _configure_metrics(self, params: Union[Dict, List]) -> Union[Dict, List]:
1347        if isinstance(params, dict) and 'eval_metric' in params \
1348           and isinstance(params['eval_metric'], list):
1349            params = dict((k, v) for k, v in params.items())
1350            eval_metrics = params['eval_metric']
1351            params.pop("eval_metric", None)
1352            params = list(params.items())
1353            for eval_metric in eval_metrics:
1354                params += [('eval_metric', eval_metric)]
1355        return params
1356
1357    def _transform_monotone_constrains(self, value: Union[Dict[str, int], str]) -> str:
1358        if isinstance(value, str):
1359            return value
1360
1361        constrained_features = set(value.keys())
1362        if not constrained_features.issubset(set(self.feature_names or [])):
1363            raise ValueError('Constrained features are not a subset of '
1364                             'training data feature names')
1365
1366        return '(' + ','.join([str(value.get(feature_name, 0))
1367                               for feature_name in self.feature_names]) + ')'
1368
1369    def _transform_interaction_constraints(
1370        self, value: Union[List[Tuple[str]], str]
1371    ) -> str:
1372        if isinstance(value, str):
1373            return value
1374
1375        feature_idx_mapping = {k: str(v) for v, k in enumerate(self.feature_names or [])}
1376
1377        try:
1378            s = "["
1379            for constraint in value:
1380                s += (
1381                    "["
1382                    + ",".join(
1383                        [feature_idx_mapping[feature_name] for feature_name in constraint]
1384                    )
1385                    + "]"
1386                )
1387            return s + "]"
1388        except KeyError as e:
1389            # pylint: disable=raise-missing-from
1390            raise ValueError(
1391                "Constrained features are not a subset of training data feature names"
1392            ) from e
1393
1394    def _configure_constraints(self, params: Union[Dict, List]) -> Union[Dict, List]:
1395        if isinstance(params, dict):
1396            value = params.get("monotone_constraints")
1397            if value:
1398                params[
1399                    "monotone_constraints"
1400                ] = self._transform_monotone_constrains(value)
1401
1402            value = params.get("interaction_constraints")
1403            if value:
1404                params[
1405                    "interaction_constraints"
1406                ] = self._transform_interaction_constraints(value)
1407
1408        elif isinstance(params, list):
1409            for idx, param in enumerate(params):
1410                name, value = param
1411                if not value:
1412                    continue
1413
1414                if name == "monotone_constraints":
1415                    params[idx] = (name, self._transform_monotone_constrains(value))
1416                elif name == "interaction_constraints":
1417                    params[idx] = (name, self._transform_interaction_constraints(value))
1418
1419        return params
1420
1421    def __del__(self):
1422        if hasattr(self, 'handle') and self.handle is not None:
1423            _check_call(_LIB.XGBoosterFree(self.handle))
1424            self.handle = None
1425
1426    def __getstate__(self):
1427        # can't pickle ctypes pointers, put model content in bytearray
1428        this = self.__dict__.copy()
1429        handle = this['handle']
1430        if handle is not None:
1431            length = c_bst_ulong()
1432            cptr = ctypes.POINTER(ctypes.c_char)()
1433            _check_call(_LIB.XGBoosterSerializeToBuffer(self.handle,
1434                                                        ctypes.byref(length),
1435                                                        ctypes.byref(cptr)))
1436            buf = ctypes2buffer(cptr, length.value)
1437            this["handle"] = buf
1438        return this
1439
1440    def __setstate__(self, state):
1441        # reconstruct handle from raw data
1442        handle = state['handle']
1443        if handle is not None:
1444            buf = handle
1445            dmats = c_array(ctypes.c_void_p, [])
1446            handle = ctypes.c_void_p()
1447            _check_call(_LIB.XGBoosterCreate(
1448                dmats, c_bst_ulong(0), ctypes.byref(handle)))
1449            length = c_bst_ulong(len(buf))
1450            ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
1451            _check_call(
1452                _LIB.XGBoosterUnserializeFromBuffer(handle, ptr, length))
1453            state['handle'] = handle
1454        self.__dict__.update(state)
1455
1456    def __getitem__(self, val):
1457        if isinstance(val, int):
1458            val = slice(val, val+1)
1459        if isinstance(val, tuple):
1460            raise ValueError('Only supports slicing through 1 dimension.')
1461        if not isinstance(val, slice):
1462            msg = _expect((int, slice), type(val))
1463            raise TypeError(msg)
1464        if isinstance(val.start, type(Ellipsis)) or val.start is None:
1465            start = 0
1466        else:
1467            start = val.start
1468        if isinstance(val.stop, type(Ellipsis)) or val.stop is None:
1469            stop = 0
1470        else:
1471            stop = val.stop
1472            if stop < start:
1473                raise ValueError('Invalid slice', val)
1474
1475        step = val.step if val.step is not None else 1
1476
1477        start = ctypes.c_int(start)
1478        stop = ctypes.c_int(stop)
1479        step = ctypes.c_int(step)
1480
1481        sliced_handle = ctypes.c_void_p()
1482        status = _LIB.XGBoosterSlice(self.handle, start, stop, step,
1483                                     ctypes.byref(sliced_handle))
1484        if status == -2:
1485            raise IndexError('Layer index out of range')
1486        _check_call(status)
1487
1488        sliced = Booster()
1489        _check_call(_LIB.XGBoosterFree(sliced.handle))
1490        sliced.handle = sliced_handle
1491        return sliced
1492
1493    def save_config(self):
1494        '''Output internal parameter configuration of Booster as a JSON
1495        string.
1496
1497        .. versionadded:: 1.0.0
1498        '''
1499        json_string = ctypes.c_char_p()
1500        length = c_bst_ulong()
1501        _check_call(_LIB.XGBoosterSaveJsonConfig(
1502            self.handle,
1503            ctypes.byref(length),
1504            ctypes.byref(json_string)))
1505        json_string = json_string.value.decode()  # pylint: disable=no-member
1506        return json_string
1507
1508    def load_config(self, config):
1509        '''Load configuration returned by `save_config`.
1510
1511        .. versionadded:: 1.0.0
1512        '''
1513        assert isinstance(config, str)
1514        _check_call(_LIB.XGBoosterLoadJsonConfig(
1515            self.handle,
1516            c_str(config)))
1517
1518    def __copy__(self):
1519        return self.__deepcopy__(None)
1520
1521    def __deepcopy__(self, _):
1522        '''Return a copy of booster.'''
1523        return Booster(model_file=self)
1524
1525    def copy(self):
1526        """Copy the booster object.
1527
1528        Returns
1529        -------
1530        booster: `Booster`
1531            a copied booster model
1532        """
1533        return self.__copy__()
1534
1535    def attr(self, key):
1536        """Get attribute string from the Booster.
1537
1538        Parameters
1539        ----------
1540        key : str
1541            The key to get attribute from.
1542
1543        Returns
1544        -------
1545        value : str
1546            The attribute value of the key, returns None if attribute do not exist.
1547        """
1548        ret = ctypes.c_char_p()
1549        success = ctypes.c_int()
1550        _check_call(_LIB.XGBoosterGetAttr(
1551            self.handle, c_str(key), ctypes.byref(ret), ctypes.byref(success)))
1552        if success.value != 0:
1553            return py_str(ret.value)
1554        return None
1555
1556    def attributes(self):
1557        """Get attributes stored in the Booster as a dictionary.
1558
1559        Returns
1560        -------
1561        result : dictionary of  attribute_name: attribute_value pairs of strings.
1562            Returns an empty dict if there's no attributes.
1563        """
1564        length = c_bst_ulong()
1565        sarr = ctypes.POINTER(ctypes.c_char_p)()
1566        _check_call(_LIB.XGBoosterGetAttrNames(self.handle,
1567                                               ctypes.byref(length),
1568                                               ctypes.byref(sarr)))
1569        attr_names = from_cstr_to_pystr(sarr, length)
1570        return {n: self.attr(n) for n in attr_names}
1571
1572    def set_attr(self, **kwargs: Optional[str]) -> None:
1573        """Set the attribute of the Booster.
1574
1575        Parameters
1576        ----------
1577        **kwargs
1578            The attributes to set. Setting a value to None deletes an attribute.
1579        """
1580        for key, value in kwargs.items():
1581            if value is not None:
1582                if not isinstance(value, STRING_TYPES):
1583                    raise ValueError("Set Attr only accepts string values")
1584                value = c_str(str(value))
1585            _check_call(_LIB.XGBoosterSetAttr(
1586                self.handle, c_str(key), value))
1587
1588    def _get_feature_info(self, field: str):
1589        length = c_bst_ulong()
1590        sarr = ctypes.POINTER(ctypes.c_char_p)()
1591        if not hasattr(self, "handle") or self.handle is None:
1592            return None
1593        _check_call(
1594            _LIB.XGBoosterGetStrFeatureInfo(
1595                self.handle, c_str(field), ctypes.byref(length), ctypes.byref(sarr),
1596            )
1597        )
1598        feature_info = from_cstr_to_pystr(sarr, length)
1599        return feature_info if feature_info else None
1600
1601    @property
1602    def feature_types(self) -> Optional[List[str]]:
1603        """Feature types for this booster.  Can be directly set by input data or by
1604        assignment.
1605
1606        """
1607        return self._get_feature_info("feature_type")
1608
1609    @property
1610    def feature_names(self) -> Optional[List[str]]:
1611        """Feature names for this booster.  Can be directly set by input data or by
1612        assignment.
1613
1614        """
1615        return self._get_feature_info("feature_name")
1616
1617    def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
1618        if features is not None:
1619            assert isinstance(features, list)
1620            c_feature_info = [bytes(f, encoding="utf-8") for f in features]
1621            c_feature_info = (ctypes.c_char_p * len(c_feature_info))(*c_feature_info)
1622            _check_call(
1623                _LIB.XGBoosterSetStrFeatureInfo(
1624                    self.handle, c_str(field), c_feature_info, c_bst_ulong(len(features))
1625                )
1626            )
1627        else:
1628            _check_call(
1629                _LIB.XGBoosterSetStrFeatureInfo(
1630                    self.handle, c_str(field), None, c_bst_ulong(0)
1631                )
1632            )
1633
1634    @feature_names.setter
1635    def feature_names(self, features: Optional[List[str]]) -> None:
1636        self._set_feature_info(features, "feature_name")
1637
1638    @feature_types.setter
1639    def feature_types(self, features: Optional[List[str]]) -> None:
1640        self._set_feature_info(features, "feature_type")
1641
1642    def set_param(self, params, value=None):
1643        """Set parameters into the Booster.
1644
1645        Parameters
1646        ----------
1647        params: dict/list/str
1648           list of key,value pairs, dict of key to value or simply str key
1649        value: optional
1650           value of the specified parameter, when params is str key
1651        """
1652        if isinstance(params, Mapping):
1653            params = params.items()
1654        elif isinstance(params, STRING_TYPES) and value is not None:
1655            params = [(params, value)]
1656        for key, val in params:
1657            if val is not None:
1658                _check_call(_LIB.XGBoosterSetParam(self.handle, c_str(key),
1659                                                   c_str(str(val))))
1660
1661    def update(self, dtrain, iteration, fobj=None):
1662        """Update for one iteration, with objective function calculated
1663        internally.  This function should not be called directly by users.
1664
1665        Parameters
1666        ----------
1667        dtrain : DMatrix
1668            Training data.
1669        iteration : int
1670            Current iteration number.
1671        fobj : function
1672            Customized objective function.
1673
1674        """
1675        if not isinstance(dtrain, DMatrix):
1676            raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
1677        self._validate_features(dtrain)
1678
1679        if fobj is None:
1680            _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1681                                                    ctypes.c_int(iteration),
1682                                                    dtrain.handle))
1683        else:
1684            pred = self.predict(dtrain, output_margin=True, training=True)
1685            grad, hess = fobj(pred, dtrain)
1686            self.boost(dtrain, grad, hess)
1687
1688    def boost(self, dtrain, grad, hess):
1689        """Boost the booster for one iteration, with customized gradient
1690        statistics.  Like :py:func:`xgboost.Booster.update`, this
1691        function should not be called directly by users.
1692
1693        Parameters
1694        ----------
1695        dtrain : DMatrix
1696            The training DMatrix.
1697        grad : list
1698            The first order of gradient.
1699        hess : list
1700            The second order of gradient.
1701
1702        """
1703        if len(grad) != len(hess):
1704            raise ValueError(
1705                f"grad / hess length mismatch: {len(grad)} / {len(hess)}"
1706            )
1707        if not isinstance(dtrain, DMatrix):
1708            raise TypeError(f"invalid training matrix: {type(dtrain).__name__}")
1709        self._validate_features(dtrain)
1710
1711        _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
1712                                               c_array(ctypes.c_float, grad),
1713                                               c_array(ctypes.c_float, hess),
1714                                               c_bst_ulong(len(grad))))
1715
1716    def eval_set(self, evals, iteration=0, feval=None):
1717        # pylint: disable=invalid-name
1718        """Evaluate a set of data.
1719
1720        Parameters
1721        ----------
1722        evals : list of tuples (DMatrix, string)
1723            List of items to be evaluated.
1724        iteration : int
1725            Current iteration.
1726        feval : function
1727            Custom evaluation function.
1728
1729        Returns
1730        -------
1731        result: str
1732            Evaluation result string.
1733        """
1734        for d in evals:
1735            if not isinstance(d[0], DMatrix):
1736                raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}")
1737            if not isinstance(d[1], STRING_TYPES):
1738                raise TypeError(f"expected string, got {type(d[1]).__name__}")
1739            self._validate_features(d[0])
1740
1741        dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals])
1742        evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals])
1743        msg = ctypes.c_char_p()
1744        _check_call(_LIB.XGBoosterEvalOneIter(self.handle,
1745                                              ctypes.c_int(iteration),
1746                                              dmats, evnames,
1747                                              c_bst_ulong(len(evals)),
1748                                              ctypes.byref(msg)))
1749        res = msg.value.decode()  # pylint: disable=no-member
1750        if feval is not None:
1751            for dmat, evname in evals:
1752                feval_ret = feval(self.predict(dmat, training=False,
1753                                               output_margin=True), dmat)
1754                if isinstance(feval_ret, list):
1755                    for name, val in feval_ret:
1756                        # pylint: disable=consider-using-f-string
1757                        res += '\t%s-%s:%f' % (evname, name, val)
1758                else:
1759                    name, val = feval_ret
1760                    # pylint: disable=consider-using-f-string
1761                    res += '\t%s-%s:%f' % (evname, name, val)
1762        return res
1763
1764    def eval(self, data, name='eval', iteration=0):
1765        """Evaluate the model on mat.
1766
1767        Parameters
1768        ----------
1769        data : DMatrix
1770            The dmatrix storing the input.
1771
1772        name : str, optional
1773            The name of the dataset.
1774
1775        iteration : int, optional
1776            The current iteration number.
1777
1778        Returns
1779        -------
1780        result: str
1781            Evaluation result string.
1782        """
1783        self._validate_features(data)
1784        return self.eval_set([(data, name)], iteration)
1785
1786    # pylint: disable=too-many-function-args
1787    def predict(
1788        self,
1789        data: DMatrix,
1790        output_margin: bool = False,
1791        ntree_limit: int = 0,
1792        pred_leaf: bool = False,
1793        pred_contribs: bool = False,
1794        approx_contribs: bool = False,
1795        pred_interactions: bool = False,
1796        validate_features: bool = True,
1797        training: bool = False,
1798        iteration_range: Tuple[int, int] = (0, 0),
1799        strict_shape: bool = False,
1800    ) -> np.ndarray:
1801        """Predict with data.  The full model will be used unless `iteration_range` is specified,
1802        meaning user have to either slice the model or use the ``best_iteration``
1803        attribute to get prediction from best model returned from early stopping.
1804
1805        .. note::
1806
1807            See `Prediction
1808            <https://xgboost.readthedocs.io/en/latest/prediction.html>`_
1809            for issues like thread safety and a summary of outputs from this function.
1810
1811        Parameters
1812        ----------
1813        data :
1814            The dmatrix storing the input.
1815
1816        output_margin :
1817            Whether to output the raw untransformed margin value.
1818
1819        ntree_limit :
1820            Deprecated, use `iteration_range` instead.
1821
1822        pred_leaf :
1823            When this option is on, the output will be a matrix of (nsample,
1824            ntrees) with each record indicating the predicted leaf index of
1825            each sample in each tree.  Note that the leaf index of a tree is
1826            unique per tree, so you may find leaf 1 in both tree 1 and tree 0.
1827
1828        pred_contribs :
1829            When this is True the output will be a matrix of size (nsample,
1830            nfeats + 1) with each record indicating the feature contributions
1831            (SHAP values) for that prediction. The sum of all feature
1832            contributions is equal to the raw untransformed margin value of the
1833            prediction. Note the final column is the bias term.
1834
1835        approx_contribs :
1836            Approximate the contributions of each feature.  Used when ``pred_contribs`` or
1837            ``pred_interactions`` is set to True.  Changing the default of this parameter
1838            (False) is not recommended.
1839
1840        pred_interactions :
1841            When this is True the output will be a matrix of size (nsample,
1842            nfeats + 1, nfeats + 1) indicating the SHAP interaction values for
1843            each pair of features. The sum of each row (or column) of the
1844            interaction values equals the corresponding SHAP value (from
1845            pred_contribs), and the sum of the entire matrix equals the raw
1846            untransformed margin value of the prediction. Note the last row and
1847            column correspond to the bias term.
1848
1849        validate_features :
1850            When this is True, validate that the Booster's and data's
1851            feature_names are identical.  Otherwise, it is assumed that the
1852            feature_names are the same.
1853
1854        training :
1855            Whether the prediction value is used for training.  This can effect `dart`
1856            booster, which performs dropouts during training iterations but use all trees
1857            for inference. If you want to obtain result with dropouts, set this parameter
1858            to `True`.  Also, the parameter is set to true when obtaining prediction for
1859            custom objective function.
1860
1861            .. versionadded:: 1.0.0
1862
1863        iteration_range :
1864            Specifies which layer of trees are used in prediction.  For example, if a
1865            random forest is trained with 100 rounds.  Specifying `iteration_range=(10,
1866            20)`, then only the forests built during [10, 20) (half open set) rounds are
1867            used in this prediction.
1868
1869            .. versionadded:: 1.4.0
1870
1871        strict_shape :
1872            When set to True, output shape is invariant to whether classification is used.
1873            For both value and margin prediction, the output shape is (n_samples,
1874            n_groups), n_groups == 1 when multi-class is not used.  Default to False, in
1875            which case the output shape can be (n_samples, ) if multi-class is not used.
1876
1877            .. versionadded:: 1.4.0
1878
1879        Returns
1880        -------
1881        prediction : numpy array
1882
1883        """
1884        if not isinstance(data, DMatrix):
1885            raise TypeError('Expecting data to be a DMatrix object, got: ', type(data))
1886        if validate_features:
1887            self._validate_features(data)
1888        iteration_range = _convert_ntree_limit(self, ntree_limit, iteration_range)
1889        args = {
1890            "type": 0,
1891            "training": training,
1892            "iteration_begin": iteration_range[0],
1893            "iteration_end": iteration_range[1],
1894            "strict_shape": strict_shape,
1895        }
1896
1897        def assign_type(t: int) -> None:
1898            if args["type"] != 0:
1899                raise ValueError("One type of prediction at a time.")
1900            args["type"] = t
1901
1902        if output_margin:
1903            assign_type(1)
1904        if pred_contribs:
1905            assign_type(2 if not approx_contribs else 3)
1906        if pred_interactions:
1907            assign_type(4 if not approx_contribs else 5)
1908        if pred_leaf:
1909            assign_type(6)
1910        preds = ctypes.POINTER(ctypes.c_float)()
1911        shape = ctypes.POINTER(c_bst_ulong)()
1912        dims = c_bst_ulong()
1913        _check_call(
1914            _LIB.XGBoosterPredictFromDMatrix(
1915                self.handle,
1916                data.handle,
1917                from_pystr_to_cstr(json.dumps(args)),
1918                ctypes.byref(shape),
1919                ctypes.byref(dims),
1920                ctypes.byref(preds)
1921            )
1922        )
1923        return _prediction_output(shape, dims, preds, False)
1924
1925    def inplace_predict(
1926        self,
1927        data: Any,
1928        iteration_range: Tuple[int, int] = (0, 0),
1929        predict_type: str = "value",
1930        missing: float = np.nan,
1931        validate_features: bool = True,
1932        base_margin: Any = None,
1933        strict_shape: bool = False
1934    ):
1935        """Run prediction in-place, Unlike ``predict`` method, inplace prediction does
1936        not cache the prediction result.
1937
1938        Calling only ``inplace_predict`` in multiple threads is safe and lock
1939        free.  But the safety does not hold when used in conjunction with other
1940        methods. E.g. you can't train the booster in one thread and perform
1941        prediction in the other.
1942
1943        .. code-block:: python
1944
1945            booster.set_param({'predictor': 'gpu_predictor'})
1946            booster.inplace_predict(cupy_array)
1947
1948            booster.set_param({'predictor': 'cpu_predictor})
1949            booster.inplace_predict(numpy_array)
1950
1951        .. versionadded:: 1.1.0
1952
1953        Parameters
1954        ----------
1955        data : numpy.ndarray/scipy.sparse.csr_matrix/cupy.ndarray/
1956               cudf.DataFrame/pd.DataFrame
1957            The input data, must not be a view for numpy array.  Set
1958            ``predictor`` to ``gpu_predictor`` for running prediction on CuPy
1959            array or CuDF DataFrame.
1960        iteration_range :
1961            See :py:meth:`xgboost.Booster.predict` for details.
1962        predict_type :
1963            * `value` Output model prediction values.
1964            * `margin` Output the raw untransformed margin value.
1965        missing :
1966            See :py:obj:`xgboost.DMatrix` for details.
1967        validate_features:
1968            See :py:meth:`xgboost.Booster.predict` for details.
1969        base_margin:
1970            See :py:obj:`xgboost.DMatrix` for details.
1971
1972            .. versionadded:: 1.4.0
1973
1974        strict_shape:
1975            See :py:meth:`xgboost.Booster.predict` for details.
1976
1977            .. versionadded:: 1.4.0
1978
1979        Returns
1980        -------
1981        prediction : numpy.ndarray/cupy.ndarray
1982            The prediction result.  When input data is on GPU, prediction
1983            result is stored in a cupy array.
1984
1985        """
1986        preds = ctypes.POINTER(ctypes.c_float)()
1987
1988        # once caching is supported, we can pass id(data) as cache id.
1989        args = {
1990            "type": 0,
1991            "training": False,
1992            "iteration_begin": iteration_range[0],
1993            "iteration_end": iteration_range[1],
1994            "missing": missing,
1995            "strict_shape": strict_shape,
1996            "cache_id": 0,
1997        }
1998        if predict_type == "margin":
1999            args["type"] = 1
2000        shape = ctypes.POINTER(c_bst_ulong)()
2001        dims = c_bst_ulong()
2002
2003        if base_margin is not None:
2004            proxy: Optional[_ProxyDMatrix] = _ProxyDMatrix()
2005            assert proxy is not None
2006            proxy.set_info(base_margin=base_margin)
2007            p_handle = proxy.handle
2008        else:
2009            proxy = None
2010            p_handle = ctypes.c_void_p()
2011        assert proxy is None or isinstance(proxy, _ProxyDMatrix)
2012        if validate_features:
2013            if not hasattr(data, "shape"):
2014                raise TypeError(
2015                    "`shape` attribute is required when `validate_features` is True."
2016                )
2017            if len(data.shape) != 1 and self.num_features() != data.shape[1]:
2018                raise ValueError(
2019                    f"Feature shape mismatch, expected: {self.num_features()}, "
2020                    f"got {data.shape[1]}"
2021                )
2022
2023        from .data import _is_pandas_df, _transform_pandas_df
2024        from .data import _array_interface
2025        if (
2026            _is_pandas_df(data)
2027            or lazy_isinstance(data, "cudf.core.dataframe", "DataFrame")
2028        ):
2029            ft = self.feature_types
2030            if ft is None:
2031                enable_categorical = False
2032            else:
2033                enable_categorical = any(f == "c" for f in ft)
2034        if _is_pandas_df(data):
2035            data, _, _ = _transform_pandas_df(data, enable_categorical)
2036
2037        if isinstance(data, np.ndarray):
2038            from .data import _ensure_np_dtype
2039            data, _ = _ensure_np_dtype(data, data.dtype)
2040            _check_call(
2041                _LIB.XGBoosterPredictFromDense(
2042                    self.handle,
2043                    _array_interface(data),
2044                    from_pystr_to_cstr(json.dumps(args)),
2045                    p_handle,
2046                    ctypes.byref(shape),
2047                    ctypes.byref(dims),
2048                    ctypes.byref(preds),
2049                )
2050            )
2051            return _prediction_output(shape, dims, preds, False)
2052        if isinstance(data, scipy.sparse.csr_matrix):
2053            csr = data
2054            _check_call(
2055                _LIB.XGBoosterPredictFromCSR(
2056                    self.handle,
2057                    _array_interface(csr.indptr),
2058                    _array_interface(csr.indices),
2059                    _array_interface(csr.data),
2060                    ctypes.c_size_t(csr.shape[1]),
2061                    from_pystr_to_cstr(json.dumps(args)),
2062                    p_handle,
2063                    ctypes.byref(shape),
2064                    ctypes.byref(dims),
2065                    ctypes.byref(preds),
2066                )
2067            )
2068            return _prediction_output(shape, dims, preds, False)
2069        if lazy_isinstance(data, "cupy.core.core", "ndarray") or lazy_isinstance(
2070            data, "cupy._core.core", "ndarray"
2071        ):
2072            from .data import _transform_cupy_array
2073
2074            data = _transform_cupy_array(data)
2075            interface_str = _cuda_array_interface(data)
2076            _check_call(
2077                _LIB.XGBoosterPredictFromCudaArray(
2078                    self.handle,
2079                    interface_str,
2080                    from_pystr_to_cstr(json.dumps(args)),
2081                    p_handle,
2082                    ctypes.byref(shape),
2083                    ctypes.byref(dims),
2084                    ctypes.byref(preds),
2085                )
2086            )
2087            return _prediction_output(shape, dims, preds, True)
2088        if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"):
2089            from .data import _cudf_array_interfaces, _transform_cudf_df
2090            data, cat_codes, _, _ = _transform_cudf_df(
2091                data, None, None, enable_categorical
2092            )
2093            interfaces_str = _cudf_array_interfaces(data, cat_codes)
2094            _check_call(
2095                _LIB.XGBoosterPredictFromCudaColumnar(
2096                    self.handle,
2097                    interfaces_str,
2098                    from_pystr_to_cstr(json.dumps(args)),
2099                    p_handle,
2100                    ctypes.byref(shape),
2101                    ctypes.byref(dims),
2102                    ctypes.byref(preds),
2103                )
2104            )
2105            return _prediction_output(shape, dims, preds, True)
2106
2107        raise TypeError(
2108            "Data type:" + str(type(data)) + " not supported by inplace prediction."
2109        )
2110
2111    def save_model(self, fname: Union[str, os.PathLike]):
2112        """Save the model to a file.
2113
2114        The model is saved in an XGBoost internal format which is universal among the
2115        various XGBoost interfaces. Auxiliary attributes of the Python Booster object
2116        (such as feature_names) will not be saved when using binary format.  To save those
2117        attributes, use JSON instead. See: `Model IO
2118        <https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html>`_ for more
2119        info.
2120
2121        Parameters
2122        ----------
2123        fname : string or os.PathLike
2124            Output file name
2125
2126        """
2127        if isinstance(fname, (STRING_TYPES, os.PathLike)):  # assume file name
2128            fname = os.fspath(os.path.expanduser(fname))
2129            _check_call(_LIB.XGBoosterSaveModel(
2130                self.handle, c_str(fname)))
2131        else:
2132            raise TypeError("fname must be a string or os PathLike")
2133
2134    def save_raw(self):
2135        """Save the model to a in memory buffer representation instead of file.
2136
2137        Returns
2138        -------
2139        a in memory buffer representation of the model
2140        """
2141        length = c_bst_ulong()
2142        cptr = ctypes.POINTER(ctypes.c_char)()
2143        _check_call(_LIB.XGBoosterGetModelRaw(self.handle,
2144                                              ctypes.byref(length),
2145                                              ctypes.byref(cptr)))
2146        return ctypes2buffer(cptr, length.value)
2147
2148    def load_model(self, fname: Union[str, bytearray, os.PathLike]) -> None:
2149        """Load the model from a file or bytearray. Path to file can be local
2150        or as an URI.
2151
2152        The model is loaded from XGBoost format which is universal among the various
2153        XGBoost interfaces. Auxiliary attributes of the Python Booster object (such as
2154        feature_names) will not be loaded when using binary format.  To save those
2155        attributes, use JSON instead.  See: `Model IO
2156        <https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html>`_ for more
2157        info.
2158
2159        Parameters
2160        ----------
2161        fname :
2162            Input file name or memory buffer(see also save_raw)
2163
2164        """
2165        if isinstance(fname, (str, os.PathLike)):
2166            # assume file name, cannot use os.path.exist to check, file can be
2167            # from URL.
2168            fname = os.fspath(os.path.expanduser(fname))
2169            _check_call(_LIB.XGBoosterLoadModel(
2170                self.handle, c_str(fname)))
2171        elif isinstance(fname, bytearray):
2172            buf = fname
2173            length = c_bst_ulong(len(buf))
2174            ptr = (ctypes.c_char * len(buf)).from_buffer(buf)
2175            _check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr,
2176                                                          length))
2177        else:
2178            raise TypeError('Unknown file type: ', fname)
2179
2180        if self.attr("best_iteration") is not None:
2181            self.best_iteration = int(self.attr("best_iteration"))
2182        if self.attr("best_score") is not None:
2183            self.best_score = float(self.attr("best_score"))
2184        if self.attr("best_ntree_limit") is not None:
2185            self.best_ntree_limit = int(self.attr("best_ntree_limit"))
2186
2187    def num_boosted_rounds(self) -> int:
2188        '''Get number of boosted rounds.  For gblinear this is reset to 0 after
2189        serializing the model.
2190
2191        '''
2192        rounds = ctypes.c_int()
2193        assert self.handle is not None
2194        _check_call(_LIB.XGBoosterBoostedRounds(self.handle, ctypes.byref(rounds)))
2195        return rounds.value
2196
2197    def num_features(self) -> int:
2198        '''Number of features in booster.'''
2199        features = ctypes.c_int()
2200        assert self.handle is not None
2201        _check_call(_LIB.XGBoosterGetNumFeature(self.handle, ctypes.byref(features)))
2202        return features.value
2203
2204    def dump_model(self, fout, fmap='', with_stats=False, dump_format="text"):
2205        """Dump model into a text or JSON file.  Unlike `save_model`, the
2206        output format is primarily used for visualization or interpretation,
2207        hence it's more human readable but cannot be loaded back to XGBoost.
2208
2209        Parameters
2210        ----------
2211        fout : string or os.PathLike
2212            Output file name.
2213        fmap : string or os.PathLike, optional
2214            Name of the file containing feature map names.
2215        with_stats : bool, optional
2216            Controls whether the split statistics are output.
2217        dump_format : string, optional
2218            Format of model dump file. Can be 'text' or 'json'.
2219        """
2220        if isinstance(fout, (STRING_TYPES, os.PathLike)):
2221            fout = os.fspath(os.path.expanduser(fout))
2222            # pylint: disable=consider-using-with
2223            fout = open(fout, 'w', encoding="utf-8")
2224            need_close = True
2225        else:
2226            need_close = False
2227        ret = self.get_dump(fmap, with_stats, dump_format)
2228        if dump_format == 'json':
2229            fout.write('[\n')
2230            for i, _ in enumerate(ret):
2231                fout.write(ret[i])
2232                if i < len(ret) - 1:
2233                    fout.write(",\n")
2234            fout.write('\n]')
2235        else:
2236            for i, _ in enumerate(ret):
2237                fout.write(f"booster[{i}]:\n")
2238                fout.write(ret[i])
2239        if need_close:
2240            fout.close()
2241
2242    def get_dump(self, fmap='', with_stats=False, dump_format="text"):
2243        """Returns the model dump as a list of strings.  Unlike `save_model`, the
2244        output format is primarily used for visualization or interpretation,
2245        hence it's more human readable but cannot be loaded back to XGBoost.
2246
2247        Parameters
2248        ----------
2249        fmap : string or os.PathLike, optional
2250            Name of the file containing feature map names.
2251        with_stats : bool, optional
2252            Controls whether the split statistics are output.
2253        dump_format : string, optional
2254            Format of model dump. Can be 'text', 'json' or 'dot'.
2255
2256        """
2257        fmap = os.fspath(os.path.expanduser(fmap))
2258        length = c_bst_ulong()
2259        sarr = ctypes.POINTER(ctypes.c_char_p)()
2260        _check_call(_LIB.XGBoosterDumpModelEx(self.handle,
2261                                              c_str(fmap),
2262                                              ctypes.c_int(with_stats),
2263                                              c_str(dump_format),
2264                                              ctypes.byref(length),
2265                                              ctypes.byref(sarr)))
2266        res = from_cstr_to_pystr(sarr, length)
2267        return res
2268
2269    def get_fscore(self, fmap=''):
2270        """Get feature importance of each feature.
2271
2272        .. note:: Zero-importance features will not be included
2273
2274           Keep in mind that this function does not include zero-importance feature, i.e.
2275           those features that have not been used in any split conditions.
2276
2277        Parameters
2278        ----------
2279        fmap: str or os.PathLike (optional)
2280           The name of feature map file
2281        """
2282
2283        return self.get_score(fmap, importance_type='weight')
2284
2285    def get_score(
2286        self, fmap: Union[str, os.PathLike] = '', importance_type: str = 'weight'
2287    ) -> Dict[str, Union[float, List[float]]]:
2288        """Get feature importance of each feature.
2289        For tree model Importance type can be defined as:
2290
2291        * 'weight': the number of times a feature is used to split the data across all trees.
2292        * 'gain': the average gain across all splits the feature is used in.
2293        * 'cover': the average coverage across all splits the feature is used in.
2294        * 'total_gain': the total gain across all splits the feature is used in.
2295        * 'total_cover': the total coverage across all splits the feature is used in.
2296
2297        .. note::
2298
2299           For linear model, only "weight" is defined and it's the normalized coefficients
2300           without bias.
2301
2302        .. note:: Zero-importance features will not be included
2303
2304           Keep in mind that this function does not include zero-importance feature, i.e.
2305           those features that have not been used in any split conditions.
2306
2307        Parameters
2308        ----------
2309        fmap: str or os.PathLike (optional)
2310           The name of feature map file.
2311        importance_type: str, default 'weight'
2312            One of the importance types defined above.
2313
2314        Returns
2315        -------
2316        A map between feature names and their scores.  When `gblinear` is used for
2317        multi-class classification the scores for each feature is a list with length
2318        `n_classes`, otherwise they're scalars.
2319        """
2320        fmap = os.fspath(os.path.expanduser(fmap))
2321        args = from_pystr_to_cstr(
2322            json.dumps({"importance_type": importance_type, "feature_map": fmap})
2323        )
2324        features = ctypes.POINTER(ctypes.c_char_p)()
2325        scores = ctypes.POINTER(ctypes.c_float)()
2326        n_out_features = c_bst_ulong()
2327        out_dim = c_bst_ulong()
2328        shape = ctypes.POINTER(c_bst_ulong)()
2329
2330        _check_call(
2331            _LIB.XGBoosterFeatureScore(
2332                self.handle,
2333                args,
2334                ctypes.byref(n_out_features),
2335                ctypes.byref(features),
2336                ctypes.byref(out_dim),
2337                ctypes.byref(shape),
2338                ctypes.byref(scores),
2339            )
2340        )
2341        features_arr = from_cstr_to_pystr(features, n_out_features)
2342        scores_arr = _prediction_output(shape, out_dim, scores, False)
2343
2344        results: Dict[str, Union[float, List[float]]] = {}
2345        if len(scores_arr.shape) > 1 and scores_arr.shape[1] > 1:
2346            for feat, score in zip(features_arr, scores_arr):
2347                results[feat] = [float(s) for s in score]
2348        else:
2349            for feat, score in zip(features_arr, scores_arr):
2350                results[feat] = float(score)
2351        return results
2352
2353    def trees_to_dataframe(self, fmap=''):  # pylint: disable=too-many-statements
2354        """Parse a boosted tree model text dump into a pandas DataFrame structure.
2355
2356        This feature is only defined when the decision tree model is chosen as base
2357        learner (`booster in {gbtree, dart}`). It is not defined for other base learner
2358        types, such as linear learners (`booster=gblinear`).
2359
2360        Parameters
2361        ----------
2362        fmap: str or os.PathLike (optional)
2363           The name of feature map file.
2364        """
2365        # pylint: disable=too-many-locals
2366        fmap = os.fspath(os.path.expanduser(fmap))
2367        if not PANDAS_INSTALLED:
2368            raise ImportError(('pandas must be available to use this method.'
2369                               'Install pandas before calling again.'))
2370
2371        if getattr(self, 'booster', None) is not None and self.booster not in {'gbtree', 'dart'}:
2372            raise ValueError(
2373                f"This method is not defined for Booster type {self.booster}"
2374            )
2375
2376        tree_ids = []
2377        node_ids = []
2378        fids = []
2379        splits = []
2380        categories = []
2381        y_directs = []
2382        n_directs = []
2383        missings = []
2384        gains = []
2385        covers = []
2386
2387        trees = self.get_dump(fmap, with_stats=True)
2388        for i, tree in enumerate(trees):
2389            for line in tree.split('\n'):
2390                arr = line.split('[')
2391                # Leaf node
2392                if len(arr) == 1:
2393                    # Last element of line.split is an empy string
2394                    if arr == ['']:
2395                        continue
2396                    # parse string
2397                    parse = arr[0].split(':')
2398                    stats = re.split('=|,', parse[1])
2399
2400                    # append to lists
2401                    tree_ids.append(i)
2402                    node_ids.append(int(re.findall(r'\b\d+\b', parse[0])[0]))
2403                    fids.append('Leaf')
2404                    splits.append(float('NAN'))
2405                    categories.append(float('NAN'))
2406                    y_directs.append(float('NAN'))
2407                    n_directs.append(float('NAN'))
2408                    missings.append(float('NAN'))
2409                    gains.append(float(stats[1]))
2410                    covers.append(float(stats[3]))
2411                # Not a Leaf Node
2412                else:
2413                    # parse string
2414                    fid = arr[1].split(']')
2415                    if fid[0].find("<") != -1:
2416                        # numerical
2417                        parse = fid[0].split('<')
2418                        splits.append(float(parse[1]))
2419                        categories.append(None)
2420                    elif fid[0].find(":{") != -1:
2421                        # categorical
2422                        parse = fid[0].split(":")
2423                        cats = parse[1][1:-1]  # strip the {}
2424                        cats = cats.split(",")
2425                        splits.append(float("NAN"))
2426                        categories.append(cats if cats else None)
2427                    else:
2428                        raise ValueError("Failed to parse model text dump.")
2429                    stats = re.split('=|,', fid[1])
2430
2431                    # append to lists
2432                    tree_ids.append(i)
2433                    node_ids.append(int(re.findall(r'\b\d+\b', arr[0])[0]))
2434                    fids.append(parse[0])
2435                    str_i = str(i)
2436                    y_directs.append(str_i + '-' + stats[1])
2437                    n_directs.append(str_i + '-' + stats[3])
2438                    missings.append(str_i + '-' + stats[5])
2439                    gains.append(float(stats[7]))
2440                    covers.append(float(stats[9]))
2441
2442        ids = [str(t_id) + '-' + str(n_id) for t_id, n_id in zip(tree_ids, node_ids)]
2443        df = DataFrame({'Tree': tree_ids, 'Node': node_ids, 'ID': ids,
2444                        'Feature': fids, 'Split': splits, 'Yes': y_directs,
2445                        'No': n_directs, 'Missing': missings, 'Gain': gains,
2446                        'Cover': covers, "Category": categories})
2447
2448        if callable(getattr(df, 'sort_values', None)):
2449            # pylint: disable=no-member
2450            return df.sort_values(['Tree', 'Node']).reset_index(drop=True)
2451        # pylint: disable=no-member
2452        return df.sort(['Tree', 'Node']).reset_index(drop=True)
2453
2454    def _validate_features(self, data: DMatrix):
2455        """
2456        Validate Booster and data's feature_names are identical.
2457        Set feature_names and feature_types from DMatrix
2458        """
2459        if data.num_row() == 0:
2460            return
2461
2462        if self.feature_names is None:
2463            self.feature_names = data.feature_names
2464            self.feature_types = data.feature_types
2465        if data.feature_names is None and self.feature_names is not None:
2466            raise ValueError(
2467                "training data did not have the following fields: " +
2468                ", ".join(self.feature_names)
2469            )
2470        # Booster can't accept data with different feature names
2471        if self.feature_names != data.feature_names:
2472            dat_missing = set(self.feature_names) - set(data.feature_names)
2473            my_missing = set(data.feature_names) - set(self.feature_names)
2474
2475            msg = 'feature_names mismatch: {0} {1}'
2476
2477            if dat_missing:
2478                msg += ('\nexpected ' + ', '.join(
2479                    str(s) for s in dat_missing) + ' in input data')
2480
2481            if my_missing:
2482                msg += ('\ntraining data did not have the following fields: ' +
2483                        ', '.join(str(s) for s in my_missing))
2484
2485            raise ValueError(msg.format(self.feature_names, data.feature_names))
2486
2487    def get_split_value_histogram(
2488        self,
2489        feature: str,
2490        fmap: Union[os.PathLike, str] = '',
2491        bins: Optional[int] = None,
2492        as_pandas: bool = True
2493    ) -> Union[np.ndarray, DataFrame]:
2494        """Get split value histogram of a feature
2495
2496        Parameters
2497        ----------
2498        feature: str
2499            The name of the feature.
2500        fmap: str or os.PathLike (optional)
2501            The name of feature map file.
2502        bin: int, default None
2503            The maximum number of bins.
2504            Number of bins equals number of unique split values n_unique,
2505            if bins == None or bins > n_unique.
2506        as_pandas: bool, default True
2507            Return pd.DataFrame when pandas is installed.
2508            If False or pandas is not installed, return numpy ndarray.
2509
2510        Returns
2511        -------
2512        a histogram of used splitting values for the specified feature
2513        either as numpy array or pandas DataFrame.
2514        """
2515        xgdump = self.get_dump(fmap=fmap)
2516        values = []
2517        # pylint: disable=consider-using-f-string
2518        regexp = re.compile(r"\[{0}<([\d.Ee+-]+)\]".format(feature))
2519        for i, _ in enumerate(xgdump):
2520            m = re.findall(regexp, xgdump[i])
2521            values.extend([float(x) for x in m])
2522
2523        n_unique = len(np.unique(values))
2524        bins = max(min(n_unique, bins) if bins is not None else n_unique, 1)
2525
2526        nph = np.histogram(values, bins=bins)
2527        nph = np.column_stack((nph[1][1:], nph[0]))
2528        nph = nph[nph[:, 1] > 0]
2529
2530        if nph.size == 0:
2531            ft = self.feature_types
2532            fn = self.feature_names
2533            if fn is None:
2534                # Let xgboost generate the feature names.
2535                fn = [f"f{i}" for i in range(self.num_features())]
2536            try:
2537                index = fn.index(feature)
2538                feature_t: Optional[str] = cast(List[str], ft)[index]
2539            except (ValueError, AttributeError, TypeError):
2540                # None.index: attr err, None[0]: type err, fn.index(-1): value err
2541                feature_t = None
2542            if feature_t == "c":  # categorical
2543                raise ValueError(
2544                    "Split value historgam doesn't support categorical split."
2545                )
2546
2547        if as_pandas and PANDAS_INSTALLED:
2548            return DataFrame(nph, columns=['SplitValue', 'Count'])
2549        if as_pandas and not PANDAS_INSTALLED:
2550            warnings.warn(
2551                "Returning histogram as ndarray"
2552                " (as_pandas == True, but pandas is not installed).",
2553                UserWarning
2554            )
2555        return nph
2556