1# -*- coding: utf-8 -*-
2# (c) Copyright by Pierre-Henri Wuillemin (LIP6), 2020  (pierre-henri.wuillemin@lip6.fr)
3
4# Permission to use, copy, modify, and distribute this
5# software and its documentation for any purpose and
6# without fee or royalty is hereby granted, provided
7# that the above copyright notice appear in all copies
8# and that both that copyright notice and this permission
9# notice appear in supporting documentation or portions
10# thereof, including modifications, that you make.
11
12# THE AUTHOR P.H. WUILLEMIN  DISCLAIMS ALL WARRANTIES
13# WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
14# WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
15# SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT
16# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
17# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
18# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
19# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
20# OR PERFORMANCE OF THIS SOFTWARE!
21
22
23import pandas
24import numpy
25import os
26import tempfile
27import warnings
28
29import sklearn
30
31import pyAgrum as gum
32
33from .discretizer import BNDiscretizer
34from ._utils import _ImplementPrior as IPrior
35from ._utils import _CalculateThreshold as CThreshold
36from ._utils import _DFNames as DFNames
37from ._utils import _createCSVfromNDArrays as CSV
38
39from ._MBCalcul import compileMarkovBlanket
40from ._MBCalcul import _calcul_proba_for_binary_class, _calcul_most_probable_for_nary_class, _calcul_proba_for_nary_class
41
42from ._learningMethods import _fitStandard as BN_fitStandard
43from ._learningMethods import _fitNaiveBayes as BN_fitNaiveBayes
44from ._learningMethods import _fitTAN as BN_fitTAN
45from ._learningMethods import _fitChowLiu as BN_fitChowLiu
46
47
48class BNClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
49  """ Represents a (scikit-learn compliant) classifier wich uses a BN to classify. A BNClassifier is build using
50
51   - a Bayesian network,
52   - a database and a learning algorithm and parameters
53   - the use of BNDiscretizer to discretize with different algorithms some variables.
54
55
56      parameters:
57            learningMethod: str
58                A string designating which type of learning we want to use. Possible values are: Chow-Liu, NaiveBayes,
59                TAN, MIIC + (MDL ou NML), GHC, 3off2 + (MDL ou NML), Tabu.
60                GHC designates Greedy Hill Climbing.
61                MIIC designates Multivariate Information based Inductive Causation
62                TAN designates Tree-augmented NaiveBayes
63                Tabu designated Tabu list searching
64
65            aPriori: str
66                A string designating the type of a priori smoothing we want to use. Possible values are Smoothing,
67                BDeu, Dirichlet and NoPrior .
68                Note: if using Dirichlet smoothing DirichletCsv cannot be set to none
69                By default (when aPriori is None) : a smoothing(0.01) is applied.
70
71            scoringType: str
72                A string designating the type of scoring we want to use. Since scoring is used while constructing the
73                network and not when learning its parameters, the scoring will be ignored if using a learning algorithm
74                with a fixed network structure such as Chow-Liu, TAN or NaiveBayes.
75                possible values are:  AIC, BIC, BD, BDeu, K2, Log2
76                AIC means Akaike information criterion
77                BIC means Bayesian Information criterion
78                BD means Bayesian-Dirichlet scoring
79                BDeu means Bayesian-Dirichlet equivalent uniform
80                Log2 means log2 likelihood ratio test
81
82            constraints: dict()
83                A dictionary designating the constraints that we want to put on the structure of the Bayesian network.
84                Ignored if using a learning algorithm where the structure is fixed such as TAN or NaiveBayes.
85                the keys of the dictionary should be the strings "PossibleEdges" , "MandatoryArcs" and  "ForbiddenArcs".
86                The format of the values should be a tuple of strings (tail,head) which designates the string arc from
87                tail to head. For example if we put the value ("x0"."y") in MandatoryArcs the network will surely have
88                an arc going from x0 to y.
89                Note: PossibleEdges allows for both (tail,head) and (head,tail) to be added to the Bayesian network,
90                while the others are not symmetric.
91
92            aPrioriWeight: double
93                The weight used for a priori smoothing.
94
95            possibleSkeleton: pyagrum.undigraph
96                An undirected graph that serves as a possible skeleton for the Bayesian network
97
98            DirichletCsv: str
99                the file name of the csv file we want to use for the dirichlet prior. Will be ignored if aPriori is not
100                set to Dirichlet.
101
102            discretizationStrategy: str
103                sets the default method of discretization for this discretizer. This method will be used if the user has
104                not specified another method for that specific variable using the setDiscretizationParameters method
105                possible values are: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'
106
107            defaultNumberOfBins: str or int
108                sets the number of bins if the method used is quantile, kmeans, uniform. In this case this parameter can
109                also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
110                If the method used is NML, this parameter sets the the maximum number of bins up to which the NML
111                algorithm searches for the optimal number of bins. In this case this parameter must be an int
112                If any other discetization method is used, this parameter is ignored.
113
114            discretizationThreshold: int or float
115                When using default parameters a variable will be treated as continous only if it has more unique values
116                than this number (if the number is an int greater than 1).
117                If the number is a float between 0 and 1, we will test if the proportion of unique values is bigger than
118                this number.
119                For instance, if you have entered 0.95, the variable will be treated as continous only if more than 95%
120                of its values are unique.
121
122            usePR: bool
123                indicates if the threshold to choose is Prevision-Recall curve's threhsold or ROC's threshold by
124                default.
125                ROC curves should be used when there are roughly equal numbers of observations for each class.
126                Precision-Recall curves should be used when there is a moderate to large class imbalance especially for
127                the target's class.
128
129            significant_digit:
130                number of significant digits when computing probabilities
131    """
132
133  def __init__(self, learningMethod="GHC", aPriori=None, scoringType="BIC", constraints=None, aPrioriWeight=1,
134               possibleSkeleton=None, DirichletCsv=None, discretizationStrategy="quantile", discretizationNbBins=5,
135               discretizationThreshold=25, usePR=False, significant_digit=10):
136    """ parameters:
137            learningMethod: str
138                A string designating which type of learning we want to use. Possible values are: Chow-Liu, NaiveBayes,
139                TAN, MIIC + (MDL ou NML), GHC, 3off2 + (MDL ou NML), Tabu.
140                GHC designates Greedy Hill Climbing.
141                MIIC designates Multivariate Information based Inductive Causation
142                TAN designates Tree-augmented NaiveBayes
143                Tabu designated Tabu list searching
144
145            aPriori: str
146                A string designating the type of a priori smoothing we want to use. Possible values are Smoothing, BDeu ,
147                Dirichlet and NoPrior.
148                Note: if using Dirichlet smoothing DirichletCsv cannot be set to none
149
150            scoringType: str
151                A string designating the type of scoring we want to use. Since scoring is used while constructing the
152                network and not when learning its parameters, the scoring will be ignored if using a learning algorithm
153                with a fixed network structure such as Chow-Liu, TAN or NaiveBayes.
154                possible values are:  AIC, BIC, BD, BDeu, K2, Log2
155                AIC means Akaike information criterion
156                BIC means Bayesian Information criterion
157                BD means Bayesian-Dirichlet scoring
158                BDeu means Bayesian-Dirichlet equivalent uniform
159                Log2 means log2 likelihood ratio test
160
161            constraints: dict()
162                A dictionary designating the constraints that we want to put on the structure of the Bayesian network.
163                Ignored if using a learning algorithm where the structure is fixed such as TAN or NaiveBayes.
164                the keys of the dictionary should be the strings "PossibleEdges" , "MandatoryArcs" and  "ForbiddenArcs".
165                The format of the values should be a tuple of strings (tail,head) which designates the string arc from
166                tail to head. For example if we put the value ("x0"."y") in MandatoryArcs the network will surely have
167                an arc going from x0 to y.
168                Note: PossibleEdges allows for both (tail,head) and (head,tail) to be added to the Bayesian network,
169                while the others are not symmetric.
170
171            aPrioriWeight: double
172                The weight used for a priori smoothing.
173
174            possibleSkeleton: pyagrum.undigraph
175                An undirected graph that serves as a possible skeleton for the Bayesian network
176
177            DirichletCsv: str
178                the file name of the csv file we want to use for the dirichlet prior. Will be ignored if aPriori is not
179                set to Dirichlet.
180
181            discretizationStrategy: str
182                sets the default method of discretization for this discretizer. This method will be used if the user has
183                not specified another method for that specific variable using the setDiscretizationParameters method
184                possible values are: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'
185
186            defaultNumberOfBins: str or int
187                sets the number of bins if the method used is quantile, kmeans, uniform. In this case this parameter can
188                also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
189                If the method used is NML, this parameter sets the the maximum number of bins up to which the NML
190                algorithm searches for the optimal number of bins. In this case this parameter must be an int
191                If any other discetization method is used, this parameter is ignored.
192
193            discretizationThreshold: int or float
194                When using default parameters a variable will be treated as continous only if it has more unique values
195                than this number (if the number is an int greater than 1).
196                If the number is a float between 0 and 1, we will test if the proportion of unique values is bigger than
197                this number.
198                For instance, if you have entered 0.95, the variable will be treated as continous only if more than 95%
199                of its values are unique.
200
201            usePR: bool
202                indicates if the threshold to choose is Prevision-Recall curve's threhsold or ROC's threshold by
203                default.
204                ROC curves should be used when there are roughly equal numbers of observations for each class.
205                Precision-Recall curves should be used when there is a moderate to large class imbalance especially for
206                the target's class.
207
208            significant_digit:
209                number of significant digits when computing probabilities
210    """
211
212    # The method of learning used
213    self.learningMethod = learningMethod
214
215    # An object used to stock the learner object from pyAgrum
216    self.learner = None
217
218    # Used to stock the Bayesian network learned
219    self.bn = None
220
221    # The threshold used for predicting the class. THe algorithm calculates the probability of a certain class, the
222    # classifier designates it as that class only if the probability is higher than the threshold.
223    # The ROC curve is used to calculate the optimal threshold
224    self.threshold = 0.5
225    self.usePR = usePR
226
227    # the type of a priori smoothing used
228    self.aPriori = aPriori
229
230    # the weight used for the a priori smoothing
231    self.aPrioriWeight = aPrioriWeight
232
233    # the type of scoring used
234    self.scoringType = scoringType
235
236    # the constraints forced onto the structure of the Bayesian network
237    self.constraints = constraints
238
239    self.possibleSkeleton = possibleSkeleton
240
241    self.DirichletCsv = DirichletCsv
242
243    self.MarkovBlanket = None
244
245    self.significant_digit = significant_digit
246
247    self.discretizationNbBins = discretizationNbBins
248    self.discretizationStrategy = discretizationStrategy
249    self.discretizationThreshold = discretizationThreshold
250    self.discretizer = BNDiscretizer(
251        discretizationStrategy, discretizationNbBins, discretizationThreshold)
252
253    # AJOUT FROM MODEL
254
255    # boolean that tells us whether this classifier is obtained from an already trained model (using the function
256    # fromTrainedModel) or not
257    self.fromModel = False
258
259    self.label = '1.0'
260
261    # the name of the target variable
262    self.target = 'y'
263
264    # the type of the target variable
265    self.targetType = None
266
267    # dict(str:int)
268    # The keys of this dictionary are the names of the variables. The value associeted to each name is
269    # the index of the variable.
270    self.variableNameIndexDictionary = None
271
272  def fit(self, X=None, y=None, filename=None, targetName=None):
273    """
274    parameters:
275        X: {array-like, sparse matrix} of shape (n_samples, n_features)
276            training data. Warning: Raises ValueError if either filename or targetname is not None. Raises ValueError
277            if y is None.
278        y: array-like of shape (n_samples)
279            Target values. Warning: Raises ValueError if either filename or targetname is not None. Raises ValueError
280            if X is None
281        filename: str
282            specifies the csv file where the training data and target values are located. Warning: Raises ValueError
283            if either X or y is not None. Raises ValueError if targetName is None
284        targetName: str
285            specifies the name of the targetVariable in the csv file. Warning: Raises ValueError if either X or y is
286            not None. Raises ValueError if filename is None.
287    returns:
288        void
289
290    Fits the model to the training data provided. The two possible uses of this function are fit(X,y) and fit(filename,
291    targetName). Any other combination will raise a ValueError
292    """
293
294    if filename is None:
295      if targetName is not None:
296        raise ValueError(
297            "This function should be used either as fit(X,y) or fit(filename=...,targetAttribute=...). You have set "
298            "filename to None, but have entered a targetName")
299      if X is None or y is None:
300        raise ValueError(
301            "This function should be used either as fit(X,y) or fit(filename=...,targetAttribute=...). You have not "
302            "entered a csv file name and not specified the X and y matrices that should be used")
303    else:
304      if targetName is None:
305        raise ValueError(
306            "This function should be used either as fit(X,y) or fit(filename=...,targetAttribute=...). The name of the "
307            "target must be specified if using this function with a csv file.")
308      if X is not None or y is not None:
309        raise ValueError(
310            "This function should be used either as fit(X,y) or fit(filename=...,targetAttribute=...). You have entered "
311            "a filename and the X and y matrices at the same time.")
312      X, y = self.XYfromCSV(filename, True, targetName)
313
314    self.fromModel = False
315    variableNames = None
316    self.discretizer.clear()
317
318    if type(y) == pandas.DataFrame:
319      self.target = y.columns.tolist()[0]
320    elif type(y) == pandas.core.series.Series:
321      self.target = y.name
322    else:
323      self.target = 'y'
324
325    if type(X) == pandas.DataFrame:
326      variableNames = X.columns.tolist()
327
328    # verifies the shape of the two arrays
329    X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True)
330
331    d = X.shape[1]
332
333    if variableNames is None:
334      variableNames = ["x" + str(i) for i in range(d)]
335
336    self.variableNameIndexDictionary = dict()
337
338    for i in range(d):
339      self.variableNameIndexDictionary[variableNames[i]] = i
340
341    self.targetType = y.dtype
342
343    possibleValuesY = numpy.unique(y)
344
345    if len(possibleValuesY) == 1:
346      raise ValueError(
347          "There is only 1 possible values for Y in the data provided")
348    if len(possibleValuesY) > 10:
349      warnings.warn(f"A classifier with too many possible values for Y (here : {possibleValuesY}) in the data provided is not meaningfull ("
350                    "please use regression methods instead).")
351
352    self.isBinaryClassifier = (len(possibleValuesY) == 2)
353
354    self.bn = gum.BayesNet('Template')
355
356    var = gum.LabelizedVariable(self.target, self.target, 0)
357    for value in possibleValuesY:
358      var.addLabel(str(value))
359    self.bn.add(var)
360
361    for i in range(d):
362      var = self.discretizer.createVariable(
363          variableNames[i], X[:, i], y, possibleValuesY)
364      self.bn.add(var)
365
366    csvfile = tempfile.NamedTemporaryFile(delete=False)
367    csvfilename = csvfile.name + ".csv"
368    csvfile.close()
369
370    CSV(X, y, self.target, self.variableNameIndexDictionary, csvfilename)
371
372    self.learner = gum.BNLearner(csvfilename, self.bn)
373
374    IPrior(self.aPriori, self.learner, self.aPrioriWeight, self.DirichletCsv)
375
376    if self.learningMethod == 'NaiveBayes':
377      self.bn = BN_fitNaiveBayes(
378          X, y, self.bn, self.learner, variableNames, self.target, self.constraints)
379    elif self.learningMethod == 'TAN':
380      self.bn = BN_fitTAN(X, y, self.bn, self.learner,
381                          variableNames, self.target)
382    elif self.learningMethod == 'Chow-Liu':
383      self.bn = BN_fitChowLiu(X, y, self.bn, self.learner,
384                              variableNames, self.target)
385    else:
386      self.bn = BN_fitStandard(X, y, self.learner, self.learningMethod, self.possibleSkeleton, self.scoringType,
387                               self.constraints)
388
389    self.label = self.bn.variableFromName(self.target).labels()[1]
390
391    self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)
392
393    if self.isBinaryClassifier:
394      self.threshold = CThreshold(
395          self.MarkovBlanket, self.target, csvfilename, self.usePR, self.significant_digit)
396
397    os.remove(csvfilename)
398
399  def fromTrainedModel(self, bn, targetAttribute, targetModality="", copy=False, threshold=0.5, variableList=None):
400    """
401    parameters:
402        bn: pyagrum.BayesNet
403            The Bayesian network we want to use for this classifier
404        targetAttribute: str
405            the attribute that will be the target in this classifier
406        targetModality: str
407            If this is a binary classifier we have to specify which modality we are looking at if the target
408            attribute has more than 2 possible values
409            if  !="", a binary classifier is created.
410            if =="", a classifier is created that can be non binary depending on the number of modalities for targetAttribute. If binary, the second one is taken as targetModality.
411        copy: bool
412            Indicates whether we want to put a copy of bn in the classifier, or bn itself.
413        threshold: double
414            The classification threshold. If the probability that the target modality is true is larger than this
415            threshold we predict that modality
416        variableList: list(str)
417            A list of strings. variableList[i] is the name of the variable that has the index i. We use this information
418            when calling predict to know which column corresponds to which variable.
419            If this list is set to none, then we use the order in which the variables were added to the network.
420
421    returns:
422        void
423
424    Creates a BN classifier from an already trained pyAgrum Bayesian network
425    """
426
427    self.fromModel = True
428
429    # the set of the names of all the variables in the Bayesian network
430    namesSet = set(bn.names())
431
432    # The target specified must be a variable in the Bayesian network
433    if targetAttribute not in namesSet:
434      raise ValueError(
435          "the target variable does not appear in the Bayesian network")
436
437    self.target = targetAttribute
438
439    self.learner = None
440
441    if copy:
442      self.bn = gum.BayesNet(bn)
443    else:
444      self.bn = bn
445
446    self.threshold = threshold
447
448    self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)
449
450    self.variableNameIndexDictionary = dict()
451    # if the user specified an order for the variables then we use this order
452    if variableList is not None:
453
454      if len(namesSet) - 1 != len(variableList):
455        raise ValueError(
456            "variableList should include all variables in the Bayesian network except the target")
457
458      i = 0
459      for name in variableList:
460        if name not in namesSet:
461          raise ValueError(
462              "variableList includes a name that does not appear in the Bayesian network")
463        self.variableNameIndexDictionary[name] = i
464        i = i + 1
465
466    # if the user didn't specify an order we use the order that the variables were added in
467    else:
468      variableList = bn.names()
469      i = 0
470      for name in variableList:
471        if name == self.target:
472          continue
473        self.variableNameIndexDictionary[name] = i
474        i = i + 1
475
476    if targetModality != "":
477      self.isBinaryClassifier = True
478      self.label = targetModality
479    else:
480      if self.bn.variableFromName(self.target).domainSize() == 2:
481        self.isBinaryClassifier = True
482        self.label = self.bn.variableFromName(self.target).labels()[
483            1]  # we take the label 1 as targetModality
484      else:
485        self.isBinaryClassifier = False
486
487    def changeVariableName(self, oldName, newName):
488      """
489      parameters:
490          oldName: str
491              the old name of the variable
492          newName: str
493              the new name of the variable
494      returns:
495          void
496
497      changes the name of a variable inside the Bayesian network
498      """
499      if oldName == self.target:
500        self.bn.changeVariableName(oldName, newName)
501        self.target = newName
502        self.MarkovBlanket.changeVariableName(oldName, newName)
503        return
504
505      if oldName not in self.variableNameIndexDictionary:
506        raise ValueError(
507            "The oldName you have specified is not a name of a variable in the Bayesian network")
508      index = self.variableNameIndexDictionary.pop(oldName)
509
510      self.variableNameIndexDictionary[newName] = index
511
512      self.bn.changeVariableName(oldName, newName)
513
514      if oldName in self.MarkovBlanket.names():
515        self.MarkovBlanket.changeVariableName(oldName, newName)
516
517  # ------------------methode Markov Blanket et predict---------------------
518
519  def predict(self, X, with_labels=True):
520    """
521    parameters:
522        X: {array-like, sparse matrix} of shape (n_samples, n_features) or str
523            test data, can be either dataFrame, matrix or name of a csv file
524        with_labels: bool
525            tells us whether the csv includes the labels themselves or their indexes.
526    returns:
527        y: array-like of shape (n_samples,)
528            Predicted classes
529
530    Predicts the most likely class for each row of input data, with bn's Markov Blanket
531    """
532    if type(X) == str:
533      X, _ = self.XYfromCSV(X, target=self.target)
534
535    if type(X) == pandas.DataFrame:
536      dictName = DFNames(X)
537    else:
538      dictName = self.variableNameIndexDictionary
539
540    if self.fromModel:
541      X = sklearn.utils.check_array(X, dtype='str', ensure_2d=False)
542    else:
543      X = sklearn.utils.check_array(X, dtype=None, ensure_2d=False)
544
545    if self.isBinaryClassifier:
546      returned_list = self._binary_predict(X, dictName)
547    else:
548      returned_list = self._nary_predict(X, dictName, with_labels)
549
550    returned_list = numpy.array(returned_list)
551    if not self.fromModel:
552      if self.targetType == "bool":
553        returned_list = returned_list == "True"
554      elif numpy.issubdtype(self.targetType, numpy.number):
555        returned_list = returned_list.astype('float')
556
557    return returned_list
558
559  def _nary_predict(self, X, dictName, with_labels):
560    """
561     For a classifier, predicts the most likely class for each row of input data, with bn's Markov Blanket
562
563    :param X: data
564    :param dictName: dictionnary of the name of a variable and his column in the data base
565    :param with_labels: tells us whether the csv includes the labels themselves or their indexes.
566    :return:
567    """
568    returned_list = []
569    I = self.MarkovBlanket.completeInstantiation()
570    I.erase(self.target)
571    for x in X:
572      vals, _ = _calcul_most_probable_for_nary_class(
573          x, I, dictName, self.MarkovBlanket, self.target)
574      if with_labels:
575        returned_list.append(self.MarkovBlanket.variable(
576            self.target).label(vals[0][self.target]))
577      else:
578        returned_list.append(vals[0][self.target])
579
580    return returned_list
581
582  def _binary_predict(self, X, dictName):
583    """
584     For a binary classifier, predicts the most likely class for each row of input data, with bn's Markov Blanket
585
586    :param X: data
587    :param dictName: dictionnary of the name of a variable and his column in the data base
588    :return:
589    """
590    returned_list = []
591    # list of other labels of the target
592    labels = [self.bn.variable(self.target).label(i)
593              for i in range(self.bn.variable(self.target).domainSize())
594              if self.bn.variable(self.target).label(i) != self.label]
595
596    # negative value to add to the list returned
597    label0 = labels[0]
598    # label of the target
599    label1 = self.label
600    # Instantiation use to apply values of the data base
601    I = self.MarkovBlanket.completeInstantiation()
602    # read through data base's ligns
603    for x in X:
604      res = round(_calcul_proba_for_binary_class(x, label1, labels, I, dictName, self.MarkovBlanket, self.target),
605                  self.significant_digit)
606
607      if res >= self.threshold:  # Positive value predicted
608        if self.fromModel:
609          returned_list.append(True)
610        else:
611          returned_list.append(label1)
612      else:  # Negative value predicted
613        if self.fromModel:
614          returned_list.append(False)
615        else:
616          returned_list.append(label0)
617
618    return returned_list
619
620  # ------------------interaction with sklearn, pour ROC et Precision-Recall ---------------------
621
622  def predict_proba(self, X):
623    """
624    parameters:
625        X: {array-like, sparse matrix} of shape (n_samples, n_features) or str
626            test data, can be either dataFrame, matrix or name of a csv file
627    returns:
628        y: array-like of shape (n_samples,)
629            Predicted probability for each classes
630
631    Predicts the probability of classes for each row of input data, with bn's Markov Blanket
632    """
633
634    # dictionnary of the name of a variable and his column in the data base
635    dictName = self.variableNameIndexDictionary
636
637    if type(X) == pandas.DataFrame:
638      dictName = DFNames(X)
639      vals = X.to_numpy()
640    elif type(X) == str:
641      vals, _ = self.XYfromCSV(X, target=self.target)
642      dictName = DFNames(vals, vals)
643      vals = vals.to_numpy()
644    else:
645      vals = X
646
647    if self.fromModel:
648      vals = sklearn.utils.check_array(vals, dtype='str', ensure_2d=False)
649    else:
650      sklearn.utils.check_array(vals, dtype=None, ensure_2d=False)
651
652    returned_list = []
653
654    # label of the target
655    label1 = self.label
656    # list of other labels of the target
657    labels = [self.bn.variable(self.target).label(i)
658              for i in range(self.bn.variable(self.target).domainSize())
659              if self.bn.variable(self.target).label(i) != self.label]
660
661    # Instantiation use to apply values of the data base
662    I = self.MarkovBlanket.completeInstantiation()
663
664    # read through data base's ligns
665    if self.isBinaryClassifier:
666      for x in vals:
667        res = round(_calcul_proba_for_binary_class(x, label1, labels, I,
668                    dictName, self.MarkovBlanket, self.target), self.significant_digit)
669        returned_list.append([1 - res, res])
670    else:
671      local_inst = gum.Instantiation(I)
672      local_inst.erase(self.target)
673      for x in vals:
674        returned_list.append(_calcul_proba_for_nary_class(
675            x, local_inst, dictName, self.MarkovBlanket, self.target).tolist())
676
677    return numpy.array(returned_list)
678
679  # ------------------ BNClassifier compatible from pyAgrum to sklearn ---------------------
680
681  def XYfromCSV(self, filename, with_labels=True, target=None):
682    """
683    parameters:
684        filename: str
685            the name of the csv file
686        with_labels: bool
687            tells us whether the csv includes the labels themselves or their indexes.
688        target: str or None
689            The name of the column that will be put in the dataframe y. If target is None, we use the target that is
690            already specified in the classifier
691    returns:
692        X: pandas.dataframe
693            Matrix containing the data
694        y: pandas.dataframe
695            Column-vector containing the class for each data vector in X
696
697    Reads the data from a csv file and separates it into a X matrix and a y column vector.
698    """
699
700    if self.fromModel:
701      dataframe = pandas.read_csv(filename, dtype='str')
702    else:
703      dataframe = pandas.read_csv(filename)
704
705    if target is None:
706      target = self.target
707    y = dataframe[target]
708    X = dataframe.drop(target, axis=1)
709
710    if not with_labels:
711      variableList = X.columns.tolist()
712      targetVariable = self.bn.variableFromName(target)
713      for index in range(len(variableList)):
714        variableList[index] = self.bn.variableFromName(variableList[index])
715      for row in X:
716        for i in len(row):
717          row[i] = variableList[i].labels(row[i])
718      if self.fromModel:
719        if self.isBinaryClassifier:
720          labelIndex = 0
721          labelList = targetVariable.labels()
722          while labelIndex < len(labelList):
723            if labelList[labelIndex] == self.label:
724              break
725            labelIndex += 1
726          y = y == labelIndex
727      else:
728        for index in range(len(y)):
729          y[index] = targetVariable(y[index])
730
731    elif self.fromModel:
732      y = y.astype('str')
733      if self.isBinaryClassifier:
734        y = y == self.label
735
736    return X, y
737
738  def showROC_PR(self, filename, save_fig=False, show_progress=False):
739    """
740    Use the `pyAgrum.lib.bn2roc` tools to create ROC and Precision-Recall curve
741
742    parameters:
743      csv_name : str
744        a csv filename
745      save_fig : bool
746        whether the graph soulb de saved
747      show_progress : bool
748        indicates if the resulting curve must be printed
749    """
750    import pyAgrum.lib.bn2roc as bn2roc
751    bn2roc.showROC_PR(self.bn, filename, self.target,
752                      self.label, significant_digits=self.significant_digit, show_progress=show_progress)
753