# -*- coding: utf-8 -*-
# (c) Copyright by Pierre-Henri Wuillemin (LIP6), 2020  (pierre-henri.wuillemin@lip6.fr)

# Permission to use, copy, modify, and distribute this
# software and its documentation for any purpose and
# without fee or royalty is hereby granted, provided
# that the above copyright notice appear in all copies
# and that both that copyright notice and this permission
# notice appear in supporting documentation or portions
# thereof, including modifications, that you make.

# THE AUTHOR P.H. WUILLEMIN  DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
# IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOFTWARE!


import pandas
import numpy
import os
import tempfile
import warnings

import sklearn

import pyAgrum as gum

from .discretizer import BNDiscretizer
from ._utils import _ImplementPrior as IPrior
from ._utils import _CalculateThreshold as CThreshold
from ._utils import _DFNames as DFNames
from ._utils import _createCSVfromNDArrays as CSV

from ._MBCalcul import compileMarkovBlanket
from ._MBCalcul import _calcul_proba_for_binary_class, _calcul_most_probable_for_nary_class, _calcul_proba_for_nary_class

from ._learningMethods import _fitStandard as BN_fitStandard
from ._learningMethods import _fitNaiveBayes as BN_fitNaiveBayes
from ._learningMethods import _fitTAN as BN_fitTAN
from ._learningMethods import _fitChowLiu as BN_fitChowLiu


class BNClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """ Represents a (scikit-learn compliant) classifier which uses a BN to classify. A BNClassifier is built using

     - a Bayesian network,
     - a database and a learning algorithm and parameters
     - the use of BNDiscretizer to discretize with different algorithms some variables.


        parameters:
            learningMethod: str
                A string designating which type of learning we want to use. Possible values are: Chow-Liu, NaiveBayes,
                TAN, MIIC + (MDL or NML), GHC, 3off2 + (MDL or NML), Tabu.
                GHC designates Greedy Hill Climbing.
                MIIC designates Multivariate Information based Inductive Causation
                TAN designates Tree-augmented NaiveBayes
                Tabu designates Tabu list searching

            aPriori: str
                A string designating the type of a priori smoothing we want to use. Possible values are Smoothing,
                BDeu, Dirichlet and NoPrior.
                Note: if using Dirichlet smoothing DirichletCsv cannot be set to none
                By default (when aPriori is None) : a smoothing(0.01) is applied.

            scoringType: str
                A string designating the type of scoring we want to use. Since scoring is used while constructing the
                network and not when learning its parameters, the scoring will be ignored if using a learning algorithm
                with a fixed network structure such as Chow-Liu, TAN or NaiveBayes.
                possible values are:  AIC, BIC, BD, BDeu, K2, Log2
                AIC means Akaike information criterion
                BIC means Bayesian Information criterion
                BD means Bayesian-Dirichlet scoring
                BDeu means Bayesian-Dirichlet equivalent uniform
                Log2 means log2 likelihood ratio test

            constraints: dict()
                A dictionary designating the constraints that we want to put on the structure of the Bayesian network.
                Ignored if using a learning algorithm where the structure is fixed such as TAN or NaiveBayes.
                the keys of the dictionary should be the strings "PossibleEdges" , "MandatoryArcs" and  "ForbiddenArcs".
                The format of the values should be a tuple of strings (tail,head) which designates the string arc from
                tail to head. For example if we put the value ("x0","y") in MandatoryArcs the network will surely have
                an arc going from x0 to y.
                Note: PossibleEdges allows for both (tail,head) and (head,tail) to be added to the Bayesian network,
                while the others are not symmetric.

            aPrioriWeight: double
                The weight used for a priori smoothing.

            possibleSkeleton: pyagrum.undigraph
                An undirected graph that serves as a possible skeleton for the Bayesian network

            DirichletCsv: str
                the file name of the csv file we want to use for the dirichlet prior. Will be ignored if aPriori is not
                set to Dirichlet.

            discretizationStrategy: str
                sets the default method of discretization for this discretizer. This method will be used if the user has
                not specified another method for that specific variable using the setDiscretizationParameters method
                possible values are: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'

            discretizationNbBins: str or int
                sets the number of bins if the method used is quantile, kmeans, uniform. In this case this parameter can
                also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
                If the method used is NML, this parameter sets the maximum number of bins up to which the NML
                algorithm searches for the optimal number of bins. In this case this parameter must be an int
                If any other discretization method is used, this parameter is ignored.

            discretizationThreshold: int or float
                When using default parameters a variable will be treated as continuous only if it has more unique values
                than this number (if the number is an int greater than 1).
                If the number is a float between 0 and 1, we will test if the proportion of unique values is bigger than
                this number.
                For instance, if you have entered 0.95, the variable will be treated as continuous only if more than 95%
                of its values are unique.

            usePR: bool
                indicates if the threshold to choose is the Precision-Recall curve's threshold or the ROC's threshold
                (the default).
                ROC curves should be used when there are roughly equal numbers of observations for each class.
                Precision-Recall curves should be used when there is a moderate to large class imbalance especially for
                the target's class.

            significant_digit:
                number of significant digits when computing probabilities
    """

    def __init__(self, learningMethod="GHC", aPriori=None, scoringType="BIC", constraints=None, aPrioriWeight=1,
                 possibleSkeleton=None, DirichletCsv=None, discretizationStrategy="quantile", discretizationNbBins=5,
                 discretizationThreshold=25, usePR=False, significant_digit=10):
        """
        Stores the hyper-parameters and creates the BNDiscretizer; nothing is learned here.

        Every parameter is described in the class docstring (scikit-learn convention). They are stored unmodified
        under attributes of the same name.
        """

        # The method of learning used
        self.learningMethod = learningMethod

        # An object used to stock the learner object from pyAgrum
        self.learner = None

        # Used to stock the Bayesian network learned
        self.bn = None

        # The threshold used for predicting the class. The algorithm calculates the probability of a certain class, the
        # classifier designates it as that class only if the probability is higher than the threshold.
        # fit() replaces this default for binary classifiers (ROC or Precision-Recall based, depending on usePR).
        self.threshold = 0.5
        self.usePR = usePR

        # the type of a priori smoothing used
        self.aPriori = aPriori

        # the weight used for the a priori smoothing
        self.aPrioriWeight = aPrioriWeight

        # the type of scoring used
        self.scoringType = scoringType

        # the constraints forced onto the structure of the Bayesian network
        self.constraints = constraints

        self.possibleSkeleton = possibleSkeleton

        self.DirichletCsv = DirichletCsv

        # Markov blanket of the target, compiled by fit() or fromTrainedModel()
        self.MarkovBlanket = None

        self.significant_digit = significant_digit

        self.discretizationNbBins = discretizationNbBins
        self.discretizationStrategy = discretizationStrategy
        self.discretizationThreshold = discretizationThreshold
        self.discretizer = BNDiscretizer(
            discretizationStrategy, discretizationNbBins, discretizationThreshold)

        # boolean that tells us whether this classifier is obtained from an already trained model (using the function
        # fromTrainedModel) or not
        self.fromModel = False

        # the label of the target considered as the "positive" class by a binary classifier
        self.label = '1.0'

        # the name of the target variable
        self.target = 'y'

        # the type of the target variable
        self.targetType = None

        # dict(str:int)
        # The keys of this dictionary are the names of the variables. The value associated to each name is
        # the index of the variable.
        self.variableNameIndexDictionary = None

    def fit(self, X=None, y=None, filename=None, targetName=None):
        """
        parameters:
            X: {array-like, sparse matrix} of shape (n_samples, n_features)
                training data. Warning: Raises ValueError if either filename or targetName is not None. Raises
                ValueError if y is None.
            y: array-like of shape (n_samples)
                Target values. Warning: Raises ValueError if either filename or targetName is not None. Raises
                ValueError if X is None
            filename: str
                specifies the csv file where the training data and target values are located. Warning: Raises
                ValueError if either X or y is not None. Raises ValueError if targetName is None
            targetName: str
                specifies the name of the target variable in the csv file. Warning: Raises ValueError if either X or y
                is not None. Raises ValueError if filename is None.
        returns:
            self: the fitted classifier (scikit-learn convention)

        Fits the model to the training data provided. The two possible uses of this function are fit(X,y) and
        fit(filename=...,targetName=...). Any other combination will raise a ValueError. A ValueError is also raised
        when the target takes a single value in the data.
        """
        usage = "This function should be used either as fit(X,y) or fit(filename=...,targetName=...). "

        if filename is None:
            if targetName is not None:
                raise ValueError(
                    usage + "You have set filename to None, but have entered a targetName")
            if X is None or y is None:
                raise ValueError(
                    usage + "You have not entered a csv file name and not specified the X and y matrices that "
                            "should be used")
        else:
            if targetName is None:
                raise ValueError(
                    usage + "The name of the target must be specified if using this function with a csv file.")
            if X is not None or y is not None:
                raise ValueError(
                    usage + "You have entered a filename and the X and y matrices at the same time.")

        # Must be reset BEFORE reading the csv: XYfromCSV reads everything as strings (and may turn y into booleans)
        # when the classifier is flagged as coming from a trained model.
        self.fromModel = False

        if filename is not None:
            X, y = self.XYfromCSV(filename, True, targetName)

        variableNames = None
        self.discretizer.clear()

        if isinstance(y, pandas.DataFrame):
            self.target = y.columns.tolist()[0]
        elif isinstance(y, pandas.Series) and y.name is not None:
            self.target = y.name
        else:
            # plain arrays and unnamed Series carry no name for the target
            self.target = 'y'

        if isinstance(X, pandas.DataFrame):
            variableNames = X.columns.tolist()

        # verifies the shape of the two arrays
        X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True)

        d = X.shape[1]

        if variableNames is None:
            variableNames = ["x" + str(i) for i in range(d)]

        self.variableNameIndexDictionary = {
            variableNames[i]: i for i in range(d)}

        self.targetType = y.dtype

        possibleValuesY = numpy.unique(y)

        if len(possibleValuesY) == 1:
            raise ValueError(
                "There is only 1 possible values for Y in the data provided")
        if len(possibleValuesY) > 10:
            warnings.warn(f"A classifier with too many possible values for Y (here : {possibleValuesY}) in the data provided is not meaningfull ("
                          "please use regression methods instead).")

        self.isBinaryClassifier = (len(possibleValuesY) == 2)

        # template network: the target first, then one (possibly discretized) variable per column of X
        self.bn = gum.BayesNet('Template')

        var = gum.LabelizedVariable(self.target, self.target, 0)
        for value in possibleValuesY:
            var.addLabel(str(value))
        self.bn.add(var)

        for i in range(d):
            var = self.discretizer.createVariable(
                variableNames[i], X[:, i], y, possibleValuesY)
            self.bn.add(var)

        # The learner reads its data from a csv file. Create the temporary file directly with the ".csv" suffix so
        # that exactly one file exists, and remove it whatever happens during learning.
        csvfile = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
        csvfilename = csvfile.name
        csvfile.close()

        try:
            CSV(X, y, self.target, self.variableNameIndexDictionary, csvfilename)

            self.learner = gum.BNLearner(csvfilename, self.bn)

            IPrior(self.aPriori, self.learner,
                   self.aPrioriWeight, self.DirichletCsv)

            if self.learningMethod == 'NaiveBayes':
                self.bn = BN_fitNaiveBayes(
                    X, y, self.bn, self.learner, variableNames, self.target, self.constraints)
            elif self.learningMethod == 'TAN':
                self.bn = BN_fitTAN(X, y, self.bn, self.learner,
                                    variableNames, self.target)
            elif self.learningMethod == 'Chow-Liu':
                self.bn = BN_fitChowLiu(X, y, self.bn, self.learner,
                                        variableNames, self.target)
            else:
                self.bn = BN_fitStandard(X, y, self.learner, self.learningMethod, self.possibleSkeleton,
                                         self.scoringType, self.constraints)

            # the second label of the target is the "positive" class
            self.label = self.bn.variableFromName(self.target).labels()[1]

            self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)

            if self.isBinaryClassifier:
                self.threshold = CThreshold(
                    self.MarkovBlanket, self.target, csvfilename, self.usePR, self.significant_digit)
        finally:
            os.remove(csvfilename)

        return self

    def fromTrainedModel(self, bn, targetAttribute, targetModality="", copy=False, threshold=0.5, variableList=None):
        """
        parameters:
            bn: pyagrum.BayesNet
                The Bayesian network we want to use for this classifier
            targetAttribute: str
                the attribute that will be the target in this classifier
            targetModality: str
                If this is a binary classifier we have to specify which modality we are looking at if the target
                attribute has more than 2 possible values
                if !="", a binary classifier is created.
                if =="", a classifier is created that can be non binary depending on the number of modalities for
                targetAttribute. If binary, the second one is taken as targetModality.
            copy: bool
                Indicates whether we want to put a copy of bn in the classifier, or bn itself.
            threshold: double
                The classification threshold. If the probability that the target modality is true is larger than this
                threshold we predict that modality
            variableList: list(str)
                A list of strings. variableList[i] is the name of the variable that has the index i. We use this
                information when calling predict to know which column corresponds to which variable.
                If this list is set to none, then we use the order in which the variables were added to the network.

        returns:
            void

        Creates a BN classifier from an already trained pyAgrum Bayesian network. Raises ValueError if the target is
        not a variable of bn, or if variableList has the wrong length or contains an unknown name.
        """

        self.fromModel = True

        # the set of the names of all the variables in the Bayesian network
        namesSet = set(bn.names())

        # The target specified must be a variable in the Bayesian network
        if targetAttribute not in namesSet:
            raise ValueError(
                "the target variable does not appear in the Bayesian network")

        self.target = targetAttribute

        self.learner = None

        self.bn = gum.BayesNet(bn) if copy else bn

        self.threshold = threshold

        self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)

        self.variableNameIndexDictionary = dict()
        if variableList is not None:
            # the user specified an order for the variables: use it
            if len(namesSet) - 1 != len(variableList):
                raise ValueError(
                    "variableList should include all variables in the Bayesian network except the target")

            for i, name in enumerate(variableList):
                if name not in namesSet:
                    raise ValueError(
                        "variableList includes a name that does not appear in the Bayesian network")
                self.variableNameIndexDictionary[name] = i
        else:
            # otherwise use the order in which the variables were added, skipping the target
            otherNames = [name for name in bn.names() if name != self.target]
            for i, name in enumerate(otherNames):
                self.variableNameIndexDictionary[name] = i

        if targetModality != "":
            self.isBinaryClassifier = True
            self.label = targetModality
        else:
            targetVariable = self.bn.variableFromName(self.target)
            if targetVariable.domainSize() == 2:
                self.isBinaryClassifier = True
                # we take the label 1 as targetModality
                self.label = targetVariable.labels()[1]
            else:
                self.isBinaryClassifier = False

    def changeVariableName(self, oldName, newName):
        """
        parameters:
            oldName: str
                the old name of the variable
            newName: str
                the new name of the variable
        returns:
            void

        changes the name of a variable inside the Bayesian network (and inside the Markov blanket when the variable
        belongs to it). Raises ValueError if oldName is neither the target nor a known variable.
        """
        if oldName == self.target:
            self.bn.changeVariableName(oldName, newName)
            self.target = newName
            self.MarkovBlanket.changeVariableName(oldName, newName)
            return

        if oldName not in self.variableNameIndexDictionary:
            raise ValueError(
                "The oldName you have specified is not a name of a variable in the Bayesian network")

        # keep the column index, only the key changes
        index = self.variableNameIndexDictionary.pop(oldName)
        self.variableNameIndexDictionary[newName] = index

        self.bn.changeVariableName(oldName, newName)

        if oldName in self.MarkovBlanket.names():
            self.MarkovBlanket.changeVariableName(oldName, newName)

    # ------------------ Markov Blanket and predict ---------------------

    def predict(self, X, with_labels=True):
        """
        parameters:
            X: {array-like, sparse matrix} of shape (n_samples, n_features) or str
                test data, can be either dataFrame, matrix or name of a csv file
            with_labels: bool
                only used by non-binary classifiers: if True the predicted labels are returned, otherwise the indexes
                of the predicted labels.
        returns:
            y: array-like of shape (n_samples,)
                Predicted classes

        Predicts the most likely class for each row of input data, with bn's Markov Blanket
        """
        if isinstance(X, str):
            X, _ = self.XYfromCSV(X, target=self.target)

        # a dataframe carries its own column names; otherwise use the order known by the classifier
        if isinstance(X, pandas.DataFrame):
            dictName = DFNames(X)
        else:
            dictName = self.variableNameIndexDictionary

        if self.fromModel:
            X = sklearn.utils.check_array(X, dtype='str', ensure_2d=False)
        else:
            X = sklearn.utils.check_array(X, dtype=None, ensure_2d=False)

        if self.isBinaryClassifier:
            returned_list = self._binary_predict(X, dictName)
        else:
            returned_list = self._nary_predict(X, dictName, with_labels)

        returned_list = numpy.array(returned_list)
        if not self.fromModel:
            # labels are strings inside the network: cast back to the type of the y seen by fit()
            if self.targetType == "bool":
                returned_list = returned_list == "True"
            elif numpy.issubdtype(self.targetType, numpy.number):
                returned_list = returned_list.astype('float')

        return returned_list

    def _nary_predict(self, X, dictName, with_labels):
        """
        For a classifier, predicts the most likely class for each row of input data, with bn's Markov Blanket

        :param X: data
        :param dictName: dictionary giving, for the name of a variable, its column in the data base
        :param with_labels: if True the labels are returned, otherwise their indexes
        :return: the list of predictions (one per row of X)
        """
        returned_list = []
        I = self.MarkovBlanket.completeInstantiation()
        I.erase(self.target)
        targetVariable = self.MarkovBlanket.variable(self.target)
        for x in X:
            vals, _ = _calcul_most_probable_for_nary_class(
                x, I, dictName, self.MarkovBlanket, self.target)
            predicted = vals[0][self.target]
            if with_labels:
                returned_list.append(targetVariable.label(predicted))
            else:
                returned_list.append(predicted)

        return returned_list

    def _binary_predict(self, X, dictName):
        """
        For a binary classifier, predicts the most likely class for each row of input data, with bn's Markov Blanket

        :param X: data
        :param dictName: dictionary giving, for the name of a variable, its column in the data base
        :return: the list of predictions (one per row of X): booleans when the classifier comes from a trained
                 model, labels of the target otherwise
        """
        returned_list = []
        targetVariable = self.bn.variable(self.target)
        # list of other labels of the target
        labels = [targetVariable.label(i)
                  for i in range(targetVariable.domainSize())
                  if targetVariable.label(i) != self.label]

        # negative value to add to the list returned
        label0 = labels[0]
        # label of the target
        label1 = self.label
        # Instantiation used to apply values of the data base
        I = self.MarkovBlanket.completeInstantiation()
        # read through the rows of the data base
        for x in X:
            res = round(_calcul_proba_for_binary_class(x, label1, labels, I, dictName, self.MarkovBlanket,
                                                       self.target),
                        self.significant_digit)

            positive = res >= self.threshold
            if self.fromModel:
                returned_list.append(bool(positive))
            else:
                returned_list.append(label1 if positive else label0)

        return returned_list

    # ------------------ interaction with sklearn, for ROC and Precision-Recall ---------------------

    def predict_proba(self, X):
        """
        parameters:
            X: {array-like, sparse matrix} of shape (n_samples, n_features) or str
                test data, can be either dataFrame, matrix or name of a csv file
        returns:
            y: array-like of shape (n_samples, n_classes)
                Predicted probability for each class. For a binary classifier each row is
                [1 - p, p] where p is the probability of the positive label.

        Predicts the probability of classes for each row of input data, with bn's Markov Blanket
        """

        # dictionary giving, for the name of a variable, its column in the data base
        dictName = self.variableNameIndexDictionary

        if isinstance(X, pandas.DataFrame):
            dictName = DFNames(X)
            vals = X.to_numpy()
        elif isinstance(X, str):
            vals, _ = self.XYfromCSV(X, target=self.target)
            dictName = DFNames(vals)
            vals = vals.to_numpy()
        else:
            vals = X

        if self.fromModel:
            vals = sklearn.utils.check_array(
                vals, dtype='str', ensure_2d=False)
        else:
            vals = sklearn.utils.check_array(
                vals, dtype=None, ensure_2d=False)

        returned_list = []

        # label of the target
        label1 = self.label
        targetVariable = self.bn.variable(self.target)
        # list of other labels of the target
        labels = [targetVariable.label(i)
                  for i in range(targetVariable.domainSize())
                  if targetVariable.label(i) != self.label]

        # Instantiation used to apply values of the data base
        I = self.MarkovBlanket.completeInstantiation()

        # read through the rows of the data base
        if self.isBinaryClassifier:
            for x in vals:
                res = round(_calcul_proba_for_binary_class(x, label1, labels, I,
                                                           dictName, self.MarkovBlanket, self.target),
                            self.significant_digit)
                returned_list.append([1 - res, res])
        else:
            local_inst = gum.Instantiation(I)
            local_inst.erase(self.target)
            for x in vals:
                returned_list.append(_calcul_proba_for_nary_class(
                    x, local_inst, dictName, self.MarkovBlanket, self.target).tolist())

        return numpy.array(returned_list)

    # ------------------ BNClassifier compatible from pyAgrum to sklearn ---------------------

    @staticmethod
    def _labelsFromIndexes(variable, indexes):
        """
        Translates a column of label indexes into the corresponding labels of a pyAgrum variable.

        :param variable: the pyAgrum discrete variable providing label(index)
        :param indexes: iterable of indexes (numbers, or strings representing numbers)
        :return: list of labels, in the same order as indexes
        """
        return [variable.label(int(index)) for index in pandas.to_numeric(indexes)]

    def XYfromCSV(self, filename, with_labels=True, target=None):
        """
        parameters:
            filename: str
                the name of the csv file
            with_labels: bool
                tells us whether the csv includes the labels themselves or their indexes.
            target: str or None
                The name of the column that will be put in the dataframe y. If target is None, we use the target that
                is already specified in the classifier
        returns:
            X: pandas.dataframe
                Matrix containing the data
            y: pandas.dataframe
                Column-vector containing the class for each data vector in X

        Reads the data from a csv file and separates it into a X matrix and a y column vector.

        When with_labels is False, the indexes found in X are replaced by the labels of the corresponding variables of
        the Bayesian network. When the classifier comes from a trained model, y is turned into booleans (binary
        classifier: True where the row is of the target modality) or into labels (with_labels False, non binary).
        """

        # a classifier built from a trained model only manipulates labels, hence strings
        if self.fromModel:
            dataframe = pandas.read_csv(filename, dtype='str')
        else:
            dataframe = pandas.read_csv(filename)

        if target is None:
            target = self.target
        y = dataframe[target]
        X = dataframe.drop(target, axis=1)

        if not with_labels:
            for name in X.columns.tolist():
                X[name] = self._labelsFromIndexes(
                    self.bn.variableFromName(name), X[name])
            if self.fromModel:
                targetVariable = self.bn.variableFromName(target)
                if self.isBinaryClassifier:
                    labelList = list(targetVariable.labels())
                    # index of the target modality (len(labelList) if absent, so that nothing matches)
                    if self.label in labelList:
                        labelIndex = labelList.index(self.label)
                    else:
                        labelIndex = len(labelList)
                    # the csv was read as strings: compare numerically
                    y = pandas.to_numeric(y) == labelIndex
                else:
                    y = pandas.Series(self._labelsFromIndexes(targetVariable, y),
                                      index=y.index, name=y.name)

        elif self.fromModel:
            y = y.astype('str')
            if self.isBinaryClassifier:
                y = y == self.label

        return X, y

    def showROC_PR(self, filename, save_fig=False, show_progress=False):
        """
        Use the `pyAgrum.lib.bn2roc` tools to create ROC and Precision-Recall curve

        parameters:
            filename : str
                a csv filename
            save_fig : bool
                whether the graph should be saved.
                NOTE(review): this value is currently not forwarded to bn2roc.showROC_PR - confirm against the
                bn2roc API before wiring it.
            show_progress : bool
                forwarded as-is to bn2roc.showROC_PR (presumably toggles a progress display - confirm)
        """
        import pyAgrum.lib.bn2roc as bn2roc
        bn2roc.showROC_PR(self.bn, filename, self.target,
                          self.label, significant_digits=self.significant_digit, show_progress=show_progress)