1# coding: utf-8 2# 3# Natural Language Toolkit: Sentiment Analyzer 4# 5# Copyright (C) 2001-2019 NLTK Project 6# Author: Pierpaolo Pantone <24alsecondo@gmail.com> 7# URL: <http://nltk.org/> 8# For license information, see LICENSE.TXT 9 10""" 11Utility methods for Sentiment Analysis. 12""" 13from __future__ import division 14 15import codecs 16import csv 17import json 18import pickle 19import random 20import re 21import sys 22import time 23from copy import deepcopy 24from itertools import tee 25 26import nltk 27from nltk.corpus import CategorizedPlaintextCorpusReader 28from nltk.data import load 29from nltk.tokenize.casual import EMOTICON_RE 30 31# //////////////////////////////////////////////////////////// 32# { Regular expressions 33# //////////////////////////////////////////////////////////// 34 35# Regular expression for negation by Christopher Potts 36NEGATION = r""" 37 (?: 38 ^(?:never|no|nothing|nowhere|noone|none|not| 39 havent|hasnt|hadnt|cant|couldnt|shouldnt| 40 wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint 41 )$ 42 ) 43 | 44 n't""" 45 46NEGATION_RE = re.compile(NEGATION, re.VERBOSE) 47 48CLAUSE_PUNCT = r'^[.:;!?]$' 49CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT) 50 51# Happy and sad emoticons 52 53HAPPY = set( 54 [ 55 ':-)', 56 ':)', 57 ';)', 58 ':o)', 59 ':]', 60 ':3', 61 ':c)', 62 ':>', 63 '=]', 64 '8)', 65 '=)', 66 ':}', 67 ':^)', 68 ':-D', 69 ':D', 70 '8-D', 71 '8D', 72 'x-D', 73 'xD', 74 'X-D', 75 'XD', 76 '=-D', 77 '=D', 78 '=-3', 79 '=3', 80 ':-))', 81 ":'-)", 82 ":')", 83 ':*', 84 ':^*', 85 '>:P', 86 ':-P', 87 ':P', 88 'X-P', 89 'x-p', 90 'xp', 91 'XP', 92 ':-p', 93 ':p', 94 '=p', 95 ':-b', 96 ':b', 97 '>:)', 98 '>;)', 99 '>:-)', 100 '<3', 101 ] 102) 103 104SAD = set( 105 [ 106 ':L', 107 ':-/', 108 '>:/', 109 ':S', 110 '>:[', 111 ':@', 112 ':-(', 113 ':[', 114 ':-||', 115 '=L', 116 ':<', 117 ':-[', 118 ':-<', 119 '=\\', 120 '=/', 121 '>:(', 122 ':(', 123 '>.<', 124 ":'-(", 125 ":'(", 126 ':\\', 127 ':-c', 128 ':c', 129 ':{', 130 '>:\\', 131 ';(', 132 ] 133) 134 135 136def timer(method): 137 """ 138 A timer decorator to measure execution performance of methods. 139 """ 140 141 def timed(*args, **kw): 142 start = time.time() 143 result = method(*args, **kw) 144 end = time.time() 145 tot_time = end - start 146 hours = tot_time // 3600 147 mins = tot_time // 60 % 60 148 # in Python 2.x round() will return a float, so we convert it to int 149 secs = int(round(tot_time % 60)) 150 if hours == 0 and mins == 0 and secs < 10: 151 print('[TIMER] {0}(): {:.3f} seconds'.format(method.__name__, tot_time)) 152 else: 153 print( 154 '[TIMER] {0}(): {1}h {2}m {3}s'.format( 155 method.__name__, hours, mins, secs 156 ) 157 ) 158 return result 159 160 return timed 161 162 163def pairwise(iterable): 164 """s -> (s0,s1), (s1,s2), (s2, s3), ...""" 165 a, b = tee(iterable) 166 next(b, None) 167 return zip(a, b) 168 169 170# //////////////////////////////////////////////////////////// 171# { Feature extractor functions 172# //////////////////////////////////////////////////////////// 173""" 174Feature extractor functions are declared outside the SentimentAnalyzer class. 175Users should have the possibility to create their own feature extractors 176without modifying SentimentAnalyzer. 177""" 178 179 180def extract_unigram_feats(document, unigrams, handle_negation=False): 181 """ 182 Populate a dictionary of unigram features, reflecting the presence/absence in 183 the document of each of the tokens in `unigrams`. 184 185 :param document: a list of words/tokens. 186 :param unigrams: a list of words/tokens whose presence/absence has to be 187 checked in `document`. 188 :param handle_negation: if `handle_negation == True` apply `mark_negation` 189 method to `document` before checking for unigram presence/absence. 190 :return: a dictionary of unigram features {unigram : boolean}. 191 192 >>> words = ['ice', 'police', 'riot'] 193 >>> document = 'ice is melting due to global warming'.split() 194 >>> sorted(extract_unigram_feats(document, words).items()) 195 [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)] 196 """ 197 features = {} 198 if handle_negation: 199 document = mark_negation(document) 200 for word in unigrams: 201 features['contains({0})'.format(word)] = word in set(document) 202 return features 203 204 205def extract_bigram_feats(document, bigrams): 206 """ 207 Populate a dictionary of bigram features, reflecting the presence/absence in 208 the document of each of the tokens in `bigrams`. This extractor function only 209 considers contiguous bigrams obtained by `nltk.bigrams`. 210 211 :param document: a list of words/tokens. 212 :param unigrams: a list of bigrams whose presence/absence has to be 213 checked in `document`. 214 :return: a dictionary of bigram features {bigram : boolean}. 215 216 >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')] 217 >>> document = 'ice is melting due to global warming'.split() 218 >>> sorted(extract_bigram_feats(document, bigrams).items()) 219 [('contains(global - warming)', True), ('contains(love - you)', False), 220 ('contains(police - prevented)', False)] 221 """ 222 features = {} 223 for bigr in bigrams: 224 features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams( 225 document 226 ) 227 return features 228 229 230# //////////////////////////////////////////////////////////// 231# { Helper Functions 232# //////////////////////////////////////////////////////////// 233 234 235def mark_negation(document, double_neg_flip=False, shallow=False): 236 """ 237 Append _NEG suffix to words that appear in the scope between a negation 238 and a punctuation mark. 239 240 :param document: a list of words/tokens, or a tuple (words, label). 241 :param shallow: if True, the method will modify the original document in place. 242 :param double_neg_flip: if True, double negation is considered affirmation 243 (we activate/deactivate negation scope everytime we find a negation). 244 :return: if `shallow == True` the method will modify the original document 245 and return it. If `shallow == False` the method will return a modified 246 document, leaving the original unmodified. 247 248 >>> sent = "I didn't like this movie . It was bad .".split() 249 >>> mark_negation(sent) 250 ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.'] 251 """ 252 if not shallow: 253 document = deepcopy(document) 254 # check if the document is labeled. If so, do not consider the label. 255 labeled = document and isinstance(document[0], (tuple, list)) 256 if labeled: 257 doc = document[0] 258 else: 259 doc = document 260 neg_scope = False 261 for i, word in enumerate(doc): 262 if NEGATION_RE.search(word): 263 if not neg_scope or (neg_scope and double_neg_flip): 264 neg_scope = not neg_scope 265 continue 266 else: 267 doc[i] += '_NEG' 268 elif neg_scope and CLAUSE_PUNCT_RE.search(word): 269 neg_scope = not neg_scope 270 elif neg_scope and not CLAUSE_PUNCT_RE.search(word): 271 doc[i] += '_NEG' 272 273 return document 274 275 276def output_markdown(filename, **kwargs): 277 """ 278 Write the output of an analysis to a file. 279 """ 280 with codecs.open(filename, 'at') as outfile: 281 text = '\n*** \n\n' 282 text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M")) 283 for k in sorted(kwargs): 284 if isinstance(kwargs[k], dict): 285 dictionary = kwargs[k] 286 text += ' - **{0}:**\n'.format(k) 287 for entry in sorted(dictionary): 288 text += ' - {0}: {1} \n'.format(entry, dictionary[entry]) 289 elif isinstance(kwargs[k], list): 290 text += ' - **{0}:**\n'.format(k) 291 for entry in kwargs[k]: 292 text += ' - {0}\n'.format(entry) 293 else: 294 text += ' - **{0}:** {1} \n'.format(k, kwargs[k]) 295 outfile.write(text) 296 297 298def save_file(content, filename): 299 """ 300 Store `content` in `filename`. Can be used to store a SentimentAnalyzer. 301 """ 302 print("Saving", filename) 303 with codecs.open(filename, 'wb') as storage_file: 304 # The protocol=2 parameter is for python2 compatibility 305 pickle.dump(content, storage_file, protocol=2) 306 307 308def split_train_test(all_instances, n=None): 309 """ 310 Randomly split `n` instances of the dataset into train and test sets. 311 312 :param all_instances: a list of instances (e.g. documents) that will be split. 313 :param n: the number of instances to consider (in case we want to use only a 314 subset). 315 :return: two lists of instances. Train set is 8/10 of the total and test set 316 is 2/10 of the total. 317 """ 318 random.seed(12345) 319 random.shuffle(all_instances) 320 if not n or n > len(all_instances): 321 n = len(all_instances) 322 train_set = all_instances[: int(0.8 * n)] 323 test_set = all_instances[int(0.8 * n) : n] 324 325 return train_set, test_set 326 327 328def _show_plot(x_values, y_values, x_labels=None, y_labels=None): 329 try: 330 import matplotlib.pyplot as plt 331 except ImportError: 332 raise ImportError( 333 'The plot function requires matplotlib to be installed.' 334 'See http://matplotlib.org/' 335 ) 336 337 plt.locator_params(axis='y', nbins=3) 338 axes = plt.axes() 339 axes.yaxis.grid() 340 plt.plot(x_values, y_values, 'ro', color='red') 341 plt.ylim(ymin=-1.2, ymax=1.2) 342 plt.tight_layout(pad=5) 343 if x_labels: 344 plt.xticks(x_values, x_labels, rotation='vertical') 345 if y_labels: 346 plt.yticks([-1, 0, 1], y_labels, rotation='horizontal') 347 # Pad margins so that markers are not clipped by the axes 348 plt.margins(0.2) 349 plt.show() 350 351 352# //////////////////////////////////////////////////////////// 353# { Parsing and conversion functions 354# //////////////////////////////////////////////////////////// 355 356 357def json2csv_preprocess( 358 json_file, 359 outfile, 360 fields, 361 encoding='utf8', 362 errors='replace', 363 gzip_compress=False, 364 skip_retweets=True, 365 skip_tongue_tweets=True, 366 skip_ambiguous_tweets=True, 367 strip_off_emoticons=True, 368 remove_duplicates=True, 369 limit=None, 370): 371 """ 372 Convert json file to csv file, preprocessing each row to obtain a suitable 373 dataset for tweets Semantic Analysis. 374 375 :param json_file: the original json file containing tweets. 376 :param outfile: the output csv filename. 377 :param fields: a list of fields that will be extracted from the json file and 378 kept in the output csv file. 379 :param encoding: the encoding of the files. 380 :param errors: the error handling strategy for the output writer. 381 :param gzip_compress: if True, create a compressed GZIP file. 382 383 :param skip_retweets: if True, remove retweets. 384 :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P" 385 emoticons. 386 :param skip_ambiguous_tweets: if True, remove tweets containing both happy 387 and sad emoticons. 388 :param strip_off_emoticons: if True, strip off emoticons from all tweets. 389 :param remove_duplicates: if True, remove tweets appearing more than once. 390 :param limit: an integer to set the number of tweets to convert. After the 391 limit is reached the conversion will stop. It can be useful to create 392 subsets of the original tweets json data. 393 """ 394 with codecs.open(json_file, encoding=encoding) as fp: 395 (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress) 396 # write the list of fields as header 397 writer.writerow(fields) 398 399 if remove_duplicates == True: 400 tweets_cache = [] 401 i = 0 402 for line in fp: 403 tweet = json.loads(line) 404 row = extract_fields(tweet, fields) 405 try: 406 text = row[fields.index('text')] 407 # Remove retweets 408 if skip_retweets == True: 409 if re.search(r'\bRT\b', text): 410 continue 411 # Remove tweets containing ":P" and ":-P" emoticons 412 if skip_tongue_tweets == True: 413 if re.search(r'\:\-?P\b', text): 414 continue 415 # Remove tweets containing both happy and sad emoticons 416 if skip_ambiguous_tweets == True: 417 all_emoticons = EMOTICON_RE.findall(text) 418 if all_emoticons: 419 if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD): 420 continue 421 # Strip off emoticons from all tweets 422 if strip_off_emoticons == True: 423 row[fields.index('text')] = re.sub( 424 r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text) 425 ) 426 # Remove duplicate tweets 427 if remove_duplicates == True: 428 if row[fields.index('text')] in tweets_cache: 429 continue 430 else: 431 tweets_cache.append(row[fields.index('text')]) 432 except ValueError: 433 pass 434 writer.writerow(row) 435 i += 1 436 if limit and i >= limit: 437 break 438 outf.close() 439 440 441def parse_tweets_set( 442 filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True 443): 444 """ 445 Parse csv file containing tweets and output data a list of (text, label) tuples. 446 447 :param filename: the input csv filename. 448 :param label: the label to be appended to each tweet contained in the csv file. 449 :param word_tokenizer: the tokenizer instance that will be used to tokenize 450 each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()). 451 If no word_tokenizer is specified, tweets will not be tokenized. 452 :param sent_tokenizer: the tokenizer that will be used to split each tweet into 453 sentences. 454 :param skip_header: if True, skip the first line of the csv file (which usually 455 contains headers). 456 457 :return: a list of (text, label) tuples. 458 """ 459 tweets = [] 460 if not sent_tokenizer: 461 sent_tokenizer = load('tokenizers/punkt/english.pickle') 462 463 # If we use Python3.x we can proceed using the 'rt' flag 464 if sys.version_info[0] == 3: 465 with codecs.open(filename, 'rt') as csvfile: 466 reader = csv.reader(csvfile) 467 if skip_header == True: 468 next(reader, None) # skip the header 469 i = 0 470 for tweet_id, text in reader: 471 # text = text[1] 472 i += 1 473 sys.stdout.write('Loaded {0} tweets\r'.format(i)) 474 # Apply sentence and word tokenizer to text 475 if word_tokenizer: 476 tweet = [ 477 w 478 for sent in sent_tokenizer.tokenize(text) 479 for w in word_tokenizer.tokenize(sent) 480 ] 481 else: 482 tweet = text 483 tweets.append((tweet, label)) 484 # If we use Python2.x we need to handle encoding problems 485 elif sys.version_info[0] < 3: 486 with codecs.open(filename) as csvfile: 487 reader = csv.reader(csvfile) 488 if skip_header == True: 489 next(reader, None) # skip the header 490 i = 0 491 for row in reader: 492 unicode_row = [x.decode('utf8') for x in row] 493 text = unicode_row[1] 494 i += 1 495 sys.stdout.write('Loaded {0} tweets\r'.format(i)) 496 # Apply sentence and word tokenizer to text 497 if word_tokenizer: 498 tweet = [ 499 w.encode('utf8') 500 for sent in sent_tokenizer.tokenize(text) 501 for w in word_tokenizer.tokenize(sent) 502 ] 503 else: 504 tweet = text 505 tweets.append((tweet, label)) 506 print("Loaded {0} tweets".format(i)) 507 return tweets 508 509 510# //////////////////////////////////////////////////////////// 511# { Demos 512# //////////////////////////////////////////////////////////// 513 514 515def demo_tweets(trainer, n_instances=None, output=None): 516 """ 517 Train and test Naive Bayes classifier on 10000 tweets, tokenized using 518 TweetTokenizer. 519 Features are composed of: 520 - 1000 most frequent unigrams 521 - 100 top bigrams (using BigramAssocMeasures.pmi) 522 523 :param trainer: `train` method of a classifier. 524 :param n_instances: the number of total tweets that have to be used for 525 training and testing. Tweets will be equally split between positive and 526 negative. 527 :param output: the output file where results have to be reported. 528 """ 529 from nltk.tokenize import TweetTokenizer 530 from nltk.sentiment import SentimentAnalyzer 531 from nltk.corpus import twitter_samples, stopwords 532 533 # Different customizations for the TweetTokenizer 534 tokenizer = TweetTokenizer(preserve_case=False) 535 # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True) 536 # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True) 537 538 if n_instances is not None: 539 n_instances = int(n_instances / 2) 540 541 fields = ['id', 'text'] 542 positive_json = twitter_samples.abspath("positive_tweets.json") 543 positive_csv = 'positive_tweets.csv' 544 json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances) 545 546 negative_json = twitter_samples.abspath("negative_tweets.json") 547 negative_csv = 'negative_tweets.csv' 548 json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances) 549 550 neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer) 551 pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer) 552 553 # We separately split subjective and objective instances to keep a balanced 554 # uniform class distribution in both train and test sets. 555 train_pos_docs, test_pos_docs = split_train_test(pos_docs) 556 train_neg_docs, test_neg_docs = split_train_test(neg_docs) 557 558 training_tweets = train_pos_docs + train_neg_docs 559 testing_tweets = test_pos_docs + test_neg_docs 560 561 sentim_analyzer = SentimentAnalyzer() 562 # stopwords = stopwords.words('english') 563 # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords] 564 all_words = [word for word in sentim_analyzer.all_words(training_tweets)] 565 566 # Add simple unigram word features 567 unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000) 568 sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) 569 570 # Add bigram collocation features 571 bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats( 572 [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12 573 ) 574 sentim_analyzer.add_feat_extractor( 575 extract_bigram_feats, bigrams=bigram_collocs_feats 576 ) 577 578 training_set = sentim_analyzer.apply_features(training_tweets) 579 test_set = sentim_analyzer.apply_features(testing_tweets) 580 581 classifier = sentim_analyzer.train(trainer, training_set) 582 # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4) 583 try: 584 classifier.show_most_informative_features() 585 except AttributeError: 586 print( 587 'Your classifier does not provide a show_most_informative_features() method.' 588 ) 589 results = sentim_analyzer.evaluate(test_set) 590 591 if output: 592 extr = [f.__name__ for f in sentim_analyzer.feat_extractors] 593 output_markdown( 594 output, 595 Dataset='labeled_tweets', 596 Classifier=type(classifier).__name__, 597 Tokenizer=tokenizer.__class__.__name__, 598 Feats=extr, 599 Results=results, 600 Instances=n_instances, 601 ) 602 603 604def demo_movie_reviews(trainer, n_instances=None, output=None): 605 """ 606 Train classifier on all instances of the Movie Reviews dataset. 607 The corpus has been preprocessed using the default sentence tokenizer and 608 WordPunctTokenizer. 609 Features are composed of: 610 - most frequent unigrams 611 612 :param trainer: `train` method of a classifier. 613 :param n_instances: the number of total reviews that have to be used for 614 training and testing. Reviews will be equally split between positive and 615 negative. 616 :param output: the output file where results have to be reported. 617 """ 618 from nltk.corpus import movie_reviews 619 from nltk.sentiment import SentimentAnalyzer 620 621 if n_instances is not None: 622 n_instances = int(n_instances / 2) 623 624 pos_docs = [ 625 (list(movie_reviews.words(pos_id)), 'pos') 626 for pos_id in movie_reviews.fileids('pos')[:n_instances] 627 ] 628 neg_docs = [ 629 (list(movie_reviews.words(neg_id)), 'neg') 630 for neg_id in movie_reviews.fileids('neg')[:n_instances] 631 ] 632 # We separately split positive and negative instances to keep a balanced 633 # uniform class distribution in both train and test sets. 634 train_pos_docs, test_pos_docs = split_train_test(pos_docs) 635 train_neg_docs, test_neg_docs = split_train_test(neg_docs) 636 637 training_docs = train_pos_docs + train_neg_docs 638 testing_docs = test_pos_docs + test_neg_docs 639 640 sentim_analyzer = SentimentAnalyzer() 641 all_words = sentim_analyzer.all_words(training_docs) 642 643 # Add simple unigram word features 644 unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4) 645 sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) 646 # Apply features to obtain a feature-value representation of our datasets 647 training_set = sentim_analyzer.apply_features(training_docs) 648 test_set = sentim_analyzer.apply_features(testing_docs) 649 650 classifier = sentim_analyzer.train(trainer, training_set) 651 try: 652 classifier.show_most_informative_features() 653 except AttributeError: 654 print( 655 'Your classifier does not provide a show_most_informative_features() method.' 656 ) 657 results = sentim_analyzer.evaluate(test_set) 658 659 if output: 660 extr = [f.__name__ for f in sentim_analyzer.feat_extractors] 661 output_markdown( 662 output, 663 Dataset='Movie_reviews', 664 Classifier=type(classifier).__name__, 665 Tokenizer='WordPunctTokenizer', 666 Feats=extr, 667 Results=results, 668 Instances=n_instances, 669 ) 670 671 672def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None): 673 """ 674 Train and test a classifier on instances of the Subjective Dataset by Pang and 675 Lee. The dataset is made of 5000 subjective and 5000 objective sentences. 676 All tokens (words and punctuation marks) are separated by a whitespace, so 677 we use the basic WhitespaceTokenizer to parse the data. 678 679 :param trainer: `train` method of a classifier. 680 :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file. 681 :param n_instances: the number of total sentences that have to be used for 682 training and testing. Sentences will be equally split between positive 683 and negative. 684 :param output: the output file where results have to be reported. 685 """ 686 from nltk.sentiment import SentimentAnalyzer 687 from nltk.corpus import subjectivity 688 689 if n_instances is not None: 690 n_instances = int(n_instances / 2) 691 692 subj_docs = [ 693 (sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances] 694 ] 695 obj_docs = [ 696 (sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances] 697 ] 698 699 # We separately split subjective and objective instances to keep a balanced 700 # uniform class distribution in both train and test sets. 701 train_subj_docs, test_subj_docs = split_train_test(subj_docs) 702 train_obj_docs, test_obj_docs = split_train_test(obj_docs) 703 704 training_docs = train_subj_docs + train_obj_docs 705 testing_docs = test_subj_docs + test_obj_docs 706 707 sentim_analyzer = SentimentAnalyzer() 708 all_words_neg = sentim_analyzer.all_words( 709 [mark_negation(doc) for doc in training_docs] 710 ) 711 712 # Add simple unigram word features handling negation 713 unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) 714 sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) 715 716 # Apply features to obtain a feature-value representation of our datasets 717 training_set = sentim_analyzer.apply_features(training_docs) 718 test_set = sentim_analyzer.apply_features(testing_docs) 719 720 classifier = sentim_analyzer.train(trainer, training_set) 721 try: 722 classifier.show_most_informative_features() 723 except AttributeError: 724 print( 725 'Your classifier does not provide a show_most_informative_features() method.' 726 ) 727 results = sentim_analyzer.evaluate(test_set) 728 729 if save_analyzer == True: 730 save_file(sentim_analyzer, 'sa_subjectivity.pickle') 731 732 if output: 733 extr = [f.__name__ for f in sentim_analyzer.feat_extractors] 734 output_markdown( 735 output, 736 Dataset='subjectivity', 737 Classifier=type(classifier).__name__, 738 Tokenizer='WhitespaceTokenizer', 739 Feats=extr, 740 Instances=n_instances, 741 Results=results, 742 ) 743 744 return sentim_analyzer 745 746 747def demo_sent_subjectivity(text): 748 """ 749 Classify a single sentence as subjective or objective using a stored 750 SentimentAnalyzer. 751 752 :param text: a sentence whose subjectivity has to be classified. 753 """ 754 from nltk.classify import NaiveBayesClassifier 755 from nltk.tokenize import regexp 756 757 word_tokenizer = regexp.WhitespaceTokenizer() 758 try: 759 sentim_analyzer = load('sa_subjectivity.pickle') 760 except LookupError: 761 print('Cannot find the sentiment analyzer you want to load.') 762 print('Training a new one using NaiveBayesClassifier.') 763 sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True) 764 765 # Tokenize and convert to lower case 766 tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)] 767 print(sentim_analyzer.classify(tokenized_text)) 768 769 770def demo_liu_hu_lexicon(sentence, plot=False): 771 """ 772 Basic example of sentiment classification using Liu and Hu opinion lexicon. 773 This function simply counts the number of positive, negative and neutral words 774 in the sentence and classifies it depending on which polarity is more represented. 775 Words that do not appear in the lexicon are considered as neutral. 776 777 :param sentence: a sentence whose polarity has to be classified. 778 :param plot: if True, plot a visual representation of the sentence polarity. 779 """ 780 from nltk.corpus import opinion_lexicon 781 from nltk.tokenize import treebank 782 783 tokenizer = treebank.TreebankWordTokenizer() 784 pos_words = 0 785 neg_words = 0 786 tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)] 787 788 x = list(range(len(tokenized_sent))) # x axis for the plot 789 y = [] 790 791 for word in tokenized_sent: 792 if word in opinion_lexicon.positive(): 793 pos_words += 1 794 y.append(1) # positive 795 elif word in opinion_lexicon.negative(): 796 neg_words += 1 797 y.append(-1) # negative 798 else: 799 y.append(0) # neutral 800 801 if pos_words > neg_words: 802 print('Positive') 803 elif pos_words < neg_words: 804 print('Negative') 805 elif pos_words == neg_words: 806 print('Neutral') 807 808 if plot == True: 809 _show_plot( 810 x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'] 811 ) 812 813 814def demo_vader_instance(text): 815 """ 816 Output polarity scores for a text using Vader approach. 817 818 :param text: a text whose polarity has to be evaluated. 819 """ 820 from nltk.sentiment import SentimentIntensityAnalyzer 821 822 vader_analyzer = SentimentIntensityAnalyzer() 823 print(vader_analyzer.polarity_scores(text)) 824 825 826def demo_vader_tweets(n_instances=None, output=None): 827 """ 828 Classify 10000 positive and negative tweets using Vader approach. 829 830 :param n_instances: the number of total tweets that have to be classified. 831 :param output: the output file where results have to be reported. 832 """ 833 from collections import defaultdict 834 from nltk.corpus import twitter_samples 835 from nltk.sentiment import SentimentIntensityAnalyzer 836 from nltk.metrics import ( 837 accuracy as eval_accuracy, 838 precision as eval_precision, 839 recall as eval_recall, 840 f_measure as eval_f_measure, 841 ) 842 843 if n_instances is not None: 844 n_instances = int(n_instances / 2) 845 846 fields = ['id', 'text'] 847 positive_json = twitter_samples.abspath("positive_tweets.json") 848 positive_csv = 'positive_tweets.csv' 849 json2csv_preprocess( 850 positive_json, 851 positive_csv, 852 fields, 853 strip_off_emoticons=False, 854 limit=n_instances, 855 ) 856 857 negative_json = twitter_samples.abspath("negative_tweets.json") 858 negative_csv = 'negative_tweets.csv' 859 json2csv_preprocess( 860 negative_json, 861 negative_csv, 862 fields, 863 strip_off_emoticons=False, 864 limit=n_instances, 865 ) 866 867 pos_docs = parse_tweets_set(positive_csv, label='pos') 868 neg_docs = parse_tweets_set(negative_csv, label='neg') 869 870 # We separately split subjective and objective instances to keep a balanced 871 # uniform class distribution in both train and test sets. 872 train_pos_docs, test_pos_docs = split_train_test(pos_docs) 873 train_neg_docs, test_neg_docs = split_train_test(neg_docs) 874 875 training_tweets = train_pos_docs + train_neg_docs 876 testing_tweets = test_pos_docs + test_neg_docs 877 878 vader_analyzer = SentimentIntensityAnalyzer() 879 880 gold_results = defaultdict(set) 881 test_results = defaultdict(set) 882 acc_gold_results = [] 883 acc_test_results = [] 884 labels = set() 885 num = 0 886 for i, (text, label) in enumerate(testing_tweets): 887 labels.add(label) 888 gold_results[label].add(i) 889 acc_gold_results.append(label) 890 score = vader_analyzer.polarity_scores(text)['compound'] 891 if score > 0: 892 observed = 'pos' 893 else: 894 observed = 'neg' 895 num += 1 896 acc_test_results.append(observed) 897 test_results[observed].add(i) 898 metrics_results = {} 899 for label in labels: 900 accuracy_score = eval_accuracy(acc_gold_results, acc_test_results) 901 metrics_results['Accuracy'] = accuracy_score 902 precision_score = eval_precision(gold_results[label], test_results[label]) 903 metrics_results['Precision [{0}]'.format(label)] = precision_score 904 recall_score = eval_recall(gold_results[label], test_results[label]) 905 metrics_results['Recall [{0}]'.format(label)] = recall_score 906 f_measure_score = eval_f_measure(gold_results[label], test_results[label]) 907 metrics_results['F-measure [{0}]'.format(label)] = f_measure_score 908 909 for result in sorted(metrics_results): 910 print('{0}: {1}'.format(result, metrics_results[result])) 911 912 if output: 913 output_markdown( 914 output, 915 Approach='Vader', 916 Dataset='labeled_tweets', 917 Instances=n_instances, 918 Results=metrics_results, 919 ) 920 921 922if __name__ == '__main__': 923 from nltk.classify import NaiveBayesClassifier, MaxentClassifier 924 from nltk.classify.scikitlearn import SklearnClassifier 925 from sklearn.svm import LinearSVC 926 from nltk.twitter.common import outf_writer_compat, extract_fields 927 928 naive_bayes = NaiveBayesClassifier.train 929 svm = SklearnClassifier(LinearSVC()).train 930 maxent = MaxentClassifier.train 931 932 demo_tweets(naive_bayes) 933 # demo_movie_reviews(svm) 934 # demo_subjectivity(svm) 935 # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ") 936 # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True) 937 # demo_vader_instance("This movie was actually neither that funny, nor super witty.") 938 # demo_vader_tweets() 939