# coding: utf-8
#
# Natural Language Toolkit: Sentiment Analyzer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Utility methods for Sentiment Analysis.
"""
from __future__ import division

import codecs
import csv
import json
import pickle
import random
import re
import sys
import time
from copy import deepcopy
from itertools import tee

import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.data import load
from nltk.tokenize.casual import EMOTICON_RE

# ////////////////////////////////////////////////////////////
# { Regular expressions
# ////////////////////////////////////////////////////////////

# Regular expression for negation by Christopher Potts
NEGATION = r"""
    (?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't"""

NEGATION_RE = re.compile(NEGATION, re.VERBOSE)

CLAUSE_PUNCT = r'^[.:;!?]$'
CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)

# Happy and sad emoticons

HAPPY = set(
    [
        ':-)',
        ':)',
        ';)',
        ':o)',
        ':]',
        ':3',
        ':c)',
        ':>',
        '=]',
        '8)',
        '=)',
        ':}',
        ':^)',
        ':-D',
        ':D',
        '8-D',
        '8D',
        'x-D',
        'xD',
        'X-D',
        'XD',
        '=-D',
        '=D',
        '=-3',
        '=3',
        ':-))',
        ":'-)",
        ":')",
        ':*',
        ':^*',
        '>:P',
        ':-P',
        ':P',
        'X-P',
        'x-p',
        'xp',
        'XP',
        ':-p',
        ':p',
        '=p',
        ':-b',
        ':b',
        '>:)',
        '>;)',
        '>:-)',
        '<3',
    ]
)

SAD = set(
    [
        ':L',
        ':-/',
        '>:/',
        ':S',
        '>:[',
        ':@',
        ':-(',
        ':[',
        ':-||',
        '=L',
        ':<',
        ':-[',
        ':-<',
        '=\\',
        '=/',
        '>:(',
        ':(',
        '>.<',
        ":'-(",
        ":'(",
        ':\\',
        ':-c',
        ':c',
        ':{',
        '>:\\',
        ';(',
    ]
)


def timer(method):
    """
    A timer decorator to measure execution performance of methods.
    """

    def timed(*args, **kw):
        start = time.time()
        result = method(*args, **kw)
        end = time.time()
        tot_time = end - start
        hours = int(tot_time // 3600)
        mins = int(tot_time // 60 % 60)
        # in Python 2.x round() will return a float, so we convert it to int
        secs = int(round(tot_time % 60))
        if hours == 0 and mins == 0 and secs < 10:
            print('[TIMER] {0}(): {1:.3f} seconds'.format(method.__name__, tot_time))
        else:
            print(
                '[TIMER] {0}(): {1}h {2}m {3}s'.format(
                    method.__name__, hours, mins, secs
                )
            )
        return result

    return timed
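
# A minimal usage sketch (``slow_sum`` is a hypothetical function, not part of
# this module): decorating a callable with ``timer`` leaves its return value
# untouched and only prints the elapsed time.
#
#     @timer
#     def slow_sum(n):
#         return sum(range(n))
#
#     slow_sum(10 ** 7)  # prints e.g. "[TIMER] slow_sum(): 0.419 seconds"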


def pairwise(iterable):
    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)
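
# For example, ``list(pairwise(['a', 'b', 'c', 'd']))`` evaluates to
# ``[('a', 'b'), ('b', 'c'), ('c', 'd')]``.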


# ////////////////////////////////////////////////////////////
# { Feature extractor functions
# ////////////////////////////////////////////////////////////
"""
Feature extractor functions are declared outside the SentimentAnalyzer class.
Users should have the possibility to create their own feature extractors
without modifying SentimentAnalyzer.
"""
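
# A custom extractor only has to accept a tokenized document as its first
# argument and return a {feature_name: value} dictionary; extra keyword
# arguments can be bound via SentimentAnalyzer.add_feat_extractor. A minimal
# sketch (``extract_length_feats`` is hypothetical, not part of NLTK), given a
# SentimentAnalyzer instance ``sentim_analyzer``:
#
#     def extract_length_feats(document, threshold=20):
#         return {'long_document': len(document) > threshold}
#
#     sentim_analyzer.add_feat_extractor(extract_length_feats, threshold=30)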


def extract_unigram_feats(document, unigrams, handle_negation=False):
    """
    Populate a dictionary of unigram features, reflecting the presence/absence in
    the document of each of the tokens in `unigrams`.

    :param document: a list of words/tokens.
    :param unigrams: a list of words/tokens whose presence/absence has to be
        checked in `document`.
    :param handle_negation: if True, apply `mark_negation` to `document` before
        checking for unigram presence/absence.
    :return: a dictionary of unigram features {unigram : boolean}.

    >>> words = ['ice', 'police', 'riot']
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_unigram_feats(document, words).items())
    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
    """
    features = {}
    if handle_negation:
        document = mark_negation(document)
    # Build the token set once instead of on every membership test
    document_tokens = set(document)
    for word in unigrams:
        features['contains({0})'.format(word)] = word in document_tokens
    return features


def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the bigrams in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    # Build the set of contiguous bigrams once instead of on every membership test
    document_bigrams = set(nltk.bigrams(document))
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = (
            bigr in document_bigrams
        )
    return features


# ////////////////////////////////////////////////////////////
# { Helper Functions
# ////////////////////////////////////////////////////////////


def mark_negation(document, double_neg_flip=False, shallow=False):
    """
    Append _NEG suffix to words that appear in the scope between a negation
    and a punctuation mark.

    :param document: a list of words/tokens, or a tuple (words, label).
    :param shallow: if True, the method will modify the original document in place.
    :param double_neg_flip: if True, double negation is considered affirmation
        (we activate/deactivate negation scope every time we find a negation).
    :return: if `shallow == True` the method will modify the original document
        and return it. If `shallow == False` the method will return a modified
        document, leaving the original unmodified.

    >>> sent = "I didn't like this movie . It was bad .".split()
    >>> mark_negation(sent)
    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
    """
    if not shallow:
        document = deepcopy(document)
    # check if the document is labeled. If so, do not consider the label.
    labeled = document and isinstance(document[0], (tuple, list))
    if labeled:
        doc = document[0]
    else:
        doc = document
    neg_scope = False
    for i, word in enumerate(doc):
        if NEGATION_RE.search(word):
            if not neg_scope or (neg_scope and double_neg_flip):
                neg_scope = not neg_scope
                continue
            else:
                doc[i] += '_NEG'
        elif neg_scope and CLAUSE_PUNCT_RE.search(word):
            neg_scope = not neg_scope
        elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
            doc[i] += '_NEG'

    return document
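
# ``mark_negation`` also accepts a labeled document, i.e. a (words, label)
# tuple; only the word list is rewritten. For example:
#
#     mark_negation((["I", "don't", "like", "it", "."], 'neg'))
#     # -> (['I', "don't", 'like_NEG', 'it_NEG', '.'], 'neg')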


def output_markdown(filename, **kwargs):
    """
    Write the output of an analysis to a file.
    """
    with codecs.open(filename, 'at') as outfile:
        text = '\n*** \n\n'
        text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M"))
        for k in sorted(kwargs):
            if isinstance(kwargs[k], dict):
                dictionary = kwargs[k]
                text += '  - **{0}:**\n'.format(k)
                for entry in sorted(dictionary):
                    text += '    - {0}: {1} \n'.format(entry, dictionary[entry])
            elif isinstance(kwargs[k], list):
                text += '  - **{0}:**\n'.format(k)
                for entry in kwargs[k]:
                    text += '    - {0}\n'.format(entry)
            else:
                text += '  - **{0}:** {1} \n'.format(k, kwargs[k])
        outfile.write(text)
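
# A minimal usage sketch ('results.md' is an arbitrary filename): every keyword
# argument becomes a bulleted entry in the markdown report, with dicts and
# lists expanded as nested bullets.
#
#     output_markdown('results.md', Dataset='movie_reviews',
#                     Classifier='NaiveBayesClassifier',
#                     Results={'Accuracy': 0.81})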


def save_file(content, filename):
    """
    Store `content` in `filename`. Can be used to store a SentimentAnalyzer.
    """
    print("Saving", filename)
    with codecs.open(filename, 'wb') as storage_file:
        # The protocol=2 parameter is for python2 compatibility
        pickle.dump(content, storage_file, protocol=2)


def split_train_test(all_instances, n=None):
    """
    Randomly split `n` instances of the dataset into train and test sets.
    Note that `all_instances` is shuffled in place before splitting.

    :param all_instances: a list of instances (e.g. documents) that will be split.
    :param n: the number of instances to consider (in case we want to use only a
        subset).
    :return: two lists of instances. Train set is 8/10 of the total and test set
        is 2/10 of the total.
    """
    random.seed(12345)
    random.shuffle(all_instances)
    if not n or n > len(all_instances):
        n = len(all_instances)
    train_set = all_instances[: int(0.8 * n)]
    test_set = all_instances[int(0.8 * n) : n]

    return train_set, test_set
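
# Illustrative usage, assuming ``docs`` is any list of (document, label) pairs
# (note that the list is shuffled in place):
#
#     train_docs, test_docs = split_train_test(docs)          # 80/20 split of all docs
#     train_docs, test_docs = split_train_test(docs, n=1000)  # consider only 1000 docs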


def _show_plot(x_values, y_values, x_labels=None, y_labels=None):
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        raise ImportError(
            'The plot function requires matplotlib to be installed. '
            'See http://matplotlib.org/'
        )

    plt.locator_params(axis='y', nbins=3)
    axes = plt.axes()
    axes.yaxis.grid()
    plt.plot(x_values, y_values, 'ro', color='red')
    plt.ylim(-1.2, 1.2)
    plt.tight_layout(pad=5)
    if x_labels:
        plt.xticks(x_values, x_labels, rotation='vertical')
    if y_labels:
        plt.yticks([-1, 0, 1], y_labels, rotation='horizontal')
    # Pad margins so that markers are not clipped by the axes
    plt.margins(0.2)
    plt.show()


# ////////////////////////////////////////////////////////////
# { Parsing and conversion functions
# ////////////////////////////////////////////////////////////


def json2csv_preprocess(
    json_file,
    outfile,
    fields,
    encoding='utf8',
    errors='replace',
    gzip_compress=False,
    skip_retweets=True,
    skip_tongue_tweets=True,
    skip_ambiguous_tweets=True,
    strip_off_emoticons=True,
    remove_duplicates=True,
    limit=None,
):
    """
    Convert a json file of tweets to a csv file, preprocessing each row to
    obtain a dataset suitable for tweet Sentiment Analysis.

    :param json_file: the original json file containing tweets.
    :param outfile: the output csv filename.
    :param fields: a list of fields that will be extracted from the json file and
        kept in the output csv file.
    :param encoding: the encoding of the files.
    :param errors: the error handling strategy for the output writer.
    :param gzip_compress: if True, create a compressed GZIP file.

    :param skip_retweets: if True, remove retweets.
    :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P"
        emoticons.
    :param skip_ambiguous_tweets: if True, remove tweets containing both happy
        and sad emoticons.
    :param strip_off_emoticons: if True, strip off emoticons from all tweets.
    :param remove_duplicates: if True, remove tweets appearing more than once.
    :param limit: an integer to set the number of tweets to convert. After the
        limit is reached the conversion will stop. It can be useful to create
        subsets of the original tweets json data.
    """
    # Local import: these helpers are otherwise only imported in the
    # ``__main__`` block at the bottom of this module.
    from nltk.twitter.common import extract_fields, outf_writer_compat

    with codecs.open(json_file, encoding=encoding) as fp:
        (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress)
        # write the list of fields as header
        writer.writerow(fields)

        if remove_duplicates:
            # Use a set for constant-time duplicate lookups
            tweets_cache = set()
        i = 0
        for line in fp:
            tweet = json.loads(line)
            row = extract_fields(tweet, fields)
            try:
                text = row[fields.index('text')]
                # Remove retweets
                if skip_retweets and re.search(r'\bRT\b', text):
                    continue
                # Remove tweets containing ":P" and ":-P" emoticons
                if skip_tongue_tweets and re.search(r'\:\-?P\b', text):
                    continue
                # Remove tweets containing both happy and sad emoticons
                if skip_ambiguous_tweets:
                    all_emoticons = set(EMOTICON_RE.findall(text))
                    if (all_emoticons & HAPPY) and (all_emoticons & SAD):
                        continue
                # Strip off emoticons from all tweets
                if strip_off_emoticons:
                    row[fields.index('text')] = re.sub(
                        r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text)
                    )
                # Remove duplicate tweets
                if remove_duplicates:
                    if row[fields.index('text')] in tweets_cache:
                        continue
                    tweets_cache.add(row[fields.index('text')])
            except ValueError:
                pass
            writer.writerow(row)
            i += 1
            if limit and i >= limit:
                break
        outf.close()
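
# A usage sketch mirroring demo_tweets() below (twitter_samples ships with
# NLTK; 'positive_tweets.csv' is an arbitrary output filename):
#
#     from nltk.corpus import twitter_samples
#     json2csv_preprocess(twitter_samples.abspath('positive_tweets.json'),
#                         'positive_tweets.csv', ['id', 'text'], limit=1000)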


def parse_tweets_set(
    filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True
):
    """
    Parse a csv file containing tweets and return a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load('tokenizers/punkt/english.pickle')

    # If we use Python3.x we can proceed using the 'rt' flag
    if sys.version_info[0] == 3:
        with codecs.open(filename, 'rt') as csvfile:
            reader = csv.reader(csvfile)
            if skip_header:
                next(reader, None)  # skip the header
            i = 0
            for tweet_id, text in reader:
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [
                        w
                        for sent in sent_tokenizer.tokenize(text)
                        for w in word_tokenizer.tokenize(sent)
                    ]
                else:
                    tweet = text
                tweets.append((tweet, label))
    # If we use Python2.x we need to handle encoding problems
    elif sys.version_info[0] < 3:
        with codecs.open(filename) as csvfile:
            reader = csv.reader(csvfile)
            if skip_header:
                next(reader, None)  # skip the header
            i = 0
            for row in reader:
                unicode_row = [x.decode('utf8') for x in row]
                text = unicode_row[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [
                        w.encode('utf8')
                        for sent in sent_tokenizer.tokenize(text)
                        for w in word_tokenizer.tokenize(sent)
                    ]
                else:
                    tweet = text
                tweets.append((tweet, label))
    print("Loaded {0} tweets".format(i))
    return tweets
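
# A usage sketch mirroring demo_tweets() below: load the csv produced by
# json2csv_preprocess(), tokenizing each tweet with TweetTokenizer.
#
#     from nltk.tokenize import TweetTokenizer
#     pos_docs = parse_tweets_set('positive_tweets.csv', label='pos',
#                                 word_tokenizer=TweetTokenizer())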


# ////////////////////////////////////////////////////////////
# { Demos
# ////////////////////////////////////////////////////////////


def demo_tweets(trainer, n_instances=None, output=None):
    """
    Train and test a classifier on 10000 tweets, tokenized using TweetTokenizer.
    Features are composed of:
        - 1000 most frequent unigrams
        - 100 top bigrams (using BigramAssocMeasures.pmi)

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total tweets that have to be used for
        training and testing. Tweets will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.tokenize import TweetTokenizer
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import twitter_samples, stopwords

    # Different customizations for the TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False)
    # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances)

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances)

    neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer)
    pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer)

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    # stopwords = stopwords.words('english')
    # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords]
    all_words = [word for word in sentim_analyzer.all_words(training_tweets)]

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Add bigram collocation features
    bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats(
        [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12
    )
    sentim_analyzer.add_feat_extractor(
        extract_bigram_feats, bigrams=bigram_collocs_feats
    )

    training_set = sentim_analyzer.apply_features(training_tweets)
    test_set = sentim_analyzer.apply_features(testing_tweets)

    classifier = sentim_analyzer.train(trainer, training_set)
    # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset='labeled_tweets',
            Classifier=type(classifier).__name__,
            Tokenizer=tokenizer.__class__.__name__,
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )


def demo_movie_reviews(trainer, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Movie Reviews dataset.
    The corpus has been preprocessed using the default sentence tokenizer and
    WordPunctTokenizer.
    Features are composed of:
        - most frequent unigrams

    :param trainer: `train` method of a classifier.
    :param n_instances: the number of total reviews that have to be used for
        training and testing. Reviews will be equally split between positive and
        negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.corpus import movie_reviews
    from nltk.sentiment import SentimentAnalyzer

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    pos_docs = [
        (list(movie_reviews.words(pos_id)), 'pos')
        for pos_id in movie_reviews.fileids('pos')[:n_instances]
    ]
    neg_docs = [
        (list(movie_reviews.words(neg_id)), 'neg')
        for neg_id in movie_reviews.fileids('neg')[:n_instances]
    ]
    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_docs = train_pos_docs + train_neg_docs
    testing_docs = test_pos_docs + test_neg_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words = sentim_analyzer.all_words(training_docs)

    # Add simple unigram word features
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset='Movie_reviews',
            Classifier=type(classifier).__name__,
            Tokenizer='WordPunctTokenizer',
            Feats=extr,
            Results=results,
            Instances=n_instances,
        )


def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjectivity Dataset by Pang
    and Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between subjective
        and objective.
    :param output: the output file where results have to be reported.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    subj_docs = [
        (sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]
    ]
    obj_docs = [
        (sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]
    ]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs + train_obj_docs
    testing_docs = test_subj_docs + test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs]
    )

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print(
            'Your classifier does not provide a show_most_informative_features() method.'
        )
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(
            output,
            Dataset='subjectivity',
            Classifier=type(classifier).__name__,
            Tokenizer='WhitespaceTokenizer',
            Feats=extr,
            Instances=n_instances,
            Results=results,
        )

    return sentim_analyzer


def demo_sent_subjectivity(text):
    """
    Classify a single sentence as subjective or objective using a stored
    SentimentAnalyzer.

    :param text: a sentence whose subjectivity has to be classified.
    """
    from nltk.classify import NaiveBayesClassifier
    from nltk.tokenize import regexp

    word_tokenizer = regexp.WhitespaceTokenizer()
    try:
        sentim_analyzer = load('sa_subjectivity.pickle')
    except LookupError:
        print('Cannot find the sentiment analyzer you want to load.')
        print('Training a new one using NaiveBayesClassifier.')
        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    # Tokenize and convert to lower case
    tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)]
    print(sentim_analyzer.classify(tokenized_text))


def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # Build the lexicon sets once: membership tests on the raw corpus word
    # lists would rescan them for every token.
    pos_lexicon = set(opinion_lexicon.positive())
    neg_lexicon = set(opinion_lexicon.negative())

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in pos_lexicon:
            pos_words += 1
            y.append(1)  # positive
        elif word in neg_lexicon:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    else:
        print('Neutral')

    if plot:
        _show_plot(
            x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']
        )


def demo_vader_instance(text):
    """
    Output polarity scores for a text using Vader approach.

    :param text: a text whose polarity has to be evaluated.
    """
    from nltk.sentiment import SentimentIntensityAnalyzer

    vader_analyzer = SentimentIntensityAnalyzer()
    print(vader_analyzer.polarity_scores(text))


def demo_vader_tweets(n_instances=None, output=None):
    """
    Classify 10000 positive and negative tweets using Vader approach.

    :param n_instances: the number of total tweets that have to be classified.
    :param output: the output file where results have to be reported.
    """
    from collections import defaultdict
    from nltk.corpus import twitter_samples
    from nltk.sentiment import SentimentIntensityAnalyzer
    from nltk.metrics import (
        accuracy as eval_accuracy,
        precision as eval_precision,
        recall as eval_recall,
        f_measure as eval_f_measure,
    )

    if n_instances is not None:
        n_instances = int(n_instances / 2)

    fields = ['id', 'text']
    positive_json = twitter_samples.abspath("positive_tweets.json")
    positive_csv = 'positive_tweets.csv'
    json2csv_preprocess(
        positive_json,
        positive_csv,
        fields,
        strip_off_emoticons=False,
        limit=n_instances,
    )

    negative_json = twitter_samples.abspath("negative_tweets.json")
    negative_csv = 'negative_tweets.csv'
    json2csv_preprocess(
        negative_json,
        negative_csv,
        fields,
        strip_off_emoticons=False,
        limit=n_instances,
    )

    pos_docs = parse_tweets_set(positive_csv, label='pos')
    neg_docs = parse_tweets_set(negative_csv, label='neg')

    # We separately split positive and negative instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
    train_neg_docs, test_neg_docs = split_train_test(neg_docs)

    training_tweets = train_pos_docs + train_neg_docs
    testing_tweets = test_pos_docs + test_neg_docs

    vader_analyzer = SentimentIntensityAnalyzer()

    gold_results = defaultdict(set)
    test_results = defaultdict(set)
    acc_gold_results = []
    acc_test_results = []
    labels = set()
    for i, (text, label) in enumerate(testing_tweets):
        labels.add(label)
        gold_results[label].add(i)
        acc_gold_results.append(label)
        score = vader_analyzer.polarity_scores(text)['compound']
        if score > 0:
            observed = 'pos'
        else:
            observed = 'neg'
        acc_test_results.append(observed)
        test_results[observed].add(i)

    metrics_results = {}
    # Accuracy is label-independent, so compute it once outside the loop
    metrics_results['Accuracy'] = eval_accuracy(acc_gold_results, acc_test_results)
    for label in labels:
        precision_score = eval_precision(gold_results[label], test_results[label])
        metrics_results['Precision [{0}]'.format(label)] = precision_score
        recall_score = eval_recall(gold_results[label], test_results[label])
        metrics_results['Recall [{0}]'.format(label)] = recall_score
        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
        metrics_results['F-measure [{0}]'.format(label)] = f_measure_score

    for result in sorted(metrics_results):
        print('{0}: {1}'.format(result, metrics_results[result]))

    if output:
        output_markdown(
            output,
            Approach='Vader',
            Dataset='labeled_tweets',
            Instances=n_instances,
            Results=metrics_results,
        )


if __name__ == '__main__':
    from nltk.classify import NaiveBayesClassifier, MaxentClassifier
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.svm import LinearSVC
    from nltk.twitter.common import outf_writer_compat, extract_fields

    naive_bayes = NaiveBayesClassifier.train
    svm = SklearnClassifier(LinearSVC()).train
    maxent = MaxentClassifier.train

    demo_tweets(naive_bayes)
    # demo_movie_reviews(svm)
    # demo_subjectivity(svm)
    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
    # demo_vader_tweets()