# Natural Language Toolkit: Interface to scikit-learn classifiers
#
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
scikit-learn (http://scikit-learn.org) is a machine learning library for
Python. It supports many classification algorithms, including SVMs,
Naive Bayes, logistic regression (MaxEnt) and decision trees.

This package implements a wrapper around scikit-learn classifiers. To use this
wrapper, construct a scikit-learn estimator object, then use that to construct
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:

>>> from sklearn.svm import LinearSVC
>>> from nltk.classify.scikitlearn import SklearnClassifier
>>> classif = SklearnClassifier(LinearSVC())

A scikit-learn classifier may include preprocessing steps when it's wrapped
in a Pipeline object. The following constructs and wraps a Naive Bayes text
classifier with tf-idf weighting and chi-square feature selection to get the
best 1000 features:

>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> from sklearn.naive_bayes import MultinomialNB
>>> from sklearn.pipeline import Pipeline
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
...                      ('chi2', SelectKBest(chi2, k=1000)),
...                      ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
"""
from __future__ import print_function, unicode_literals

from six.moves import zip

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
from nltk import compat

try:
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    # scikit-learn is an optional dependency: keep this module importable
    # without it, and report the problem when a classifier is constructed.
    DictVectorizer = LabelEncoder = None

__all__ = ['SklearnClassifier']


@compat.python_2_unicode_compatible
class SklearnClassifier(ClassifierI):
    """Wrapper for scikit-learn classifiers."""

    def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may take a
            great amount of memory.
        :type sparse: boolean.

        :raise ImportError: if scikit-learn could not be imported.
        """
        if DictVectorizer is None or LabelEncoder is None:
            # Fail with an actionable message instead of a NameError.
            raise ImportError(
                "SklearnClassifier requires scikit-learn "
                "(http://scikit-learn.org), which could not be imported"
            )
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def __repr__(self):
        return "<SklearnClassifier(%r)>" % self._clf

    def classify_many(self, featuresets):
        """Classify a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        X = self._vectorizer.transform(featuresets)
        classes = self._encoder.classes_
        # The estimator predicts encoded label indices; map them back to
        # the original labels.
        return [classes[i] for i in self._clf.predict(X)]

    def prob_classify_many(self, featuresets):
        """Compute per-class probabilities for a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :rtype: list of ``ProbDistI``
        """
        X = self._vectorizer.transform(featuresets)
        y_proba_list = self._clf.predict_proba(X)
        return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels used by this classifier.

        :rtype: list
        """
        return list(self._encoder.classes_)

    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        :param labeled_featuresets: A list of ``(featureset, label)``
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        :return: This classifier, so that calls can be chained.
        :raise ValueError: if ``labeled_featuresets`` is empty.
        """
        # Materialize once so that emptiness can be checked even when a
        # generator is passed in.
        labeled_featuresets = list(labeled_featuresets)
        if not labeled_featuresets:
            raise ValueError(
                "cannot train a SklearnClassifier on an empty collection "
                "of labeled featuresets"
            )

        # Split the (featureset, label) pairs into parallel sequences.
        X, y = zip(*labeled_featuresets)
        X = self._vectorizer.fit_transform(X)
        y = self._encoder.fit_transform(y)
        self._clf.fit(X, y)

        return self

    def _make_probdist(self, y_proba):
        """Build a probability distribution over labels for one sample.

        :param y_proba: Per-class probabilities for a single sample, indexed
            by encoded label.
        :rtype: DictionaryProbDist
        """
        classes = self._encoder.classes_
        return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})


# skip doctests if scikit-learn is not installed
def setup_module(module):
    from nose import SkipTest

    try:
        import sklearn
    except ImportError:
        raise SkipTest("scikit-learn is not installed")


if __name__ == "__main__":
    from nltk.classify.util import names_demo, names_demo_features
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB

    # Bernoulli Naive Bayes is designed for binary (boolean) features. We set
    # the binarize option to False since we know we're passing boolean
    # features.
    print("scikit-learn Naive Bayes:")
    names_demo(
        SklearnClassifier(BernoulliNB(binarize=False)).train,
        features=names_demo_features,
    )

    # The C parameter on logistic regression (MaxEnt) controls regularization.
    # The higher it's set, the less regularized the classifier is.
    print("\n\nscikit-learn logistic regression:")
    names_demo(
        SklearnClassifier(LogisticRegression(C=1000)).train,
        features=names_demo_features,
    )