# Natural Language Toolkit: Interface to scikit-learn classifiers
#
# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
scikit-learn (http://scikit-learn.org) is a machine learning library for
Python. It supports many classification algorithms, including SVMs,
Naive Bayes, logistic regression (MaxEnt) and decision trees.

This package implements a wrapper around scikit-learn classifiers. To use this
wrapper, construct a scikit-learn estimator object, then use that to construct
a SklearnClassifier. E.g., to wrap a linear SVM with default settings:

>>> from sklearn.svm import LinearSVC
>>> from nltk.classify.scikitlearn import SklearnClassifier
>>> classif = SklearnClassifier(LinearSVC())

A scikit-learn classifier may include preprocessing steps when it's wrapped
in a Pipeline object. The following constructs and wraps a Naive Bayes text
classifier with tf-idf weighting and chi-square feature selection to get the
best 1000 features:

>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> from sklearn.feature_selection import SelectKBest, chi2
>>> from sklearn.naive_bayes import MultinomialNB
>>> from sklearn.pipeline import Pipeline
>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
...                      ('chi2', SelectKBest(chi2, k=1000)),
...                      ('nb', MultinomialNB())])
>>> classif = SklearnClassifier(pipeline)
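
The wrapped classifier is trained by passing a list of ``(featureset, label)``
pairs to ``train()``; ``classify_many()`` then predicts a label for each
featureset. A toy example (the feature dicts below are made up purely for
illustration, and the exact predictions depend on the estimator and data):

>>> train_data = [({'a': 4, 'b': 1, 'c': 0}, 'ham'),
...               ({'a': 5, 'b': 2, 'c': 1}, 'ham'),
...               ({'a': 0, 'b': 3, 'c': 4}, 'spam'),
...               ({'a': 5, 'b': 1, 'c': 1}, 'ham'),
...               ({'a': 1, 'b': 4, 'c': 3}, 'spam')]
>>> classif = SklearnClassifier(LinearSVC()).train(train_data)
>>> test_data = [{'a': 3, 'b': 2, 'c': 1},
...              {'a': 0, 'b': 3, 'c': 7}]
>>> classif.classify_many(test_data)  # doctest: +SKIP
['ham', 'spam']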
"""
from __future__ import print_function, unicode_literals

from six.moves import zip

from nltk.classify.api import ClassifierI
from nltk.probability import DictionaryProbDist
from nltk import compat

try:
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.preprocessing import LabelEncoder
except ImportError:
    pass

__all__ = ['SklearnClassifier']


@compat.python_2_unicode_compatible
class SklearnClassifier(ClassifierI):
    """Wrapper for scikit-learn classifiers."""

    def __init__(self, estimator, dtype=float, sparse=True):
        """
        :param estimator: scikit-learn classifier object.

        :param dtype: data type used when building the feature array.
            scikit-learn estimators work exclusively on numeric data. The
            default value should be fine for almost all situations.

        :param sparse: Whether to use sparse matrices internally.
            The estimator must support these; not all scikit-learn classifiers
            do (see their respective documentation and look for "sparse
            matrix"). The default value is True, since most NLP problems
            involve sparse feature sets. Setting this to False may require a
            large amount of memory.
        :type sparse: boolean.
        """
        self._clf = estimator
        self._encoder = LabelEncoder()
        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)

    def __repr__(self):
        return "<SklearnClassifier(%r)>" % self._clf

    def classify_many(self, featuresets):
        """Classify a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        X = self._vectorizer.transform(featuresets)
        classes = self._encoder.classes_
        return [classes[i] for i in self._clf.predict(X)]

    def prob_classify_many(self, featuresets):
        """Compute per-class probabilities for a batch of samples.

        :param featuresets: An iterable over featuresets, each a dict mapping
            strings to either numbers, booleans or strings.
        :rtype: list of ``ProbDistI``
        """
        X = self._vectorizer.transform(featuresets)
        y_proba_list = self._clf.predict_proba(X)
        return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels used by this classifier.

        :rtype: list
        """
        return list(self._encoder.classes_)
    def train(self, labeled_featuresets):
        """
        Train (fit) the scikit-learn estimator.

        :param labeled_featuresets: A list of ``(featureset, label)`` pairs,
            where each ``featureset`` is a dict mapping strings to either
            numbers, booleans or strings.
        """
        X, y = list(zip(*labeled_featuresets))
        X = self._vectorizer.fit_transform(X)
        y = self._encoder.fit_transform(y)
        self._clf.fit(X, y)

        return self

    def _make_probdist(self, y_proba):
        classes = self._encoder.classes_
        return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)})


# skip doctests if scikit-learn is not installed
def setup_module(module):
    from nose import SkipTest

    try:
        import sklearn
    except ImportError:
        raise SkipTest("scikit-learn is not installed")


if __name__ == "__main__":
    from nltk.classify.util import names_demo, names_demo_features
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import BernoulliNB

    # Bernoulli Naive Bayes is designed for binary classification. We set the
    # binarize option to False since we know we're passing boolean features.
    print("scikit-learn Naive Bayes:")
    names_demo(
        SklearnClassifier(BernoulliNB(binarize=False)).train,
        features=names_demo_features,
    )

    # The C parameter on logistic regression (MaxEnt) controls regularization.
    # The higher it's set, the less regularized the classifier is.
    print("\n\nscikit-learn logistic regression:")
    names_demo(
        SklearnClassifier(LogisticRegression(C=1000)).train,
        features=names_demo_features,
    )
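
    # A toy sketch (illustrative data only, separate from the names_demo runs
    # above): train a tiny Bernoulli Naive Bayes model and print the per-class
    # probability distribution that prob_classify_many() returns for one
    # featureset.
    print("\n\nscikit-learn per-class probabilities (toy example):")
    toy_train = [
        ({"last_letter": "a"}, "female"),
        ({"last_letter": "k"}, "male"),
        ({"last_letter": "a"}, "female"),
        ({"last_letter": "o"}, "male"),
    ]
    toy_classif = SklearnClassifier(BernoulliNB(binarize=False)).train(toy_train)
    toy_dist = toy_classif.prob_classify_many([{"last_letter": "a"}])[0]
    for label in toy_classif.labels():
        print("%s: %.3f" % (label, toy_dist.prob(label)))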