1# coding: utf-8 2# Natural Language Toolkit: vader 3# 4# Copyright (C) 2001-2019 NLTK Project 5# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu> 6# Ewan Klein <ewan@inf.ed.ac.uk> (modifications) 7# Pierpaolo Pantone <24alsecondo@gmail.com> (modifications) 8# George Berry <geb97@cornell.edu> (modifications) 9# URL: <http://nltk.org/> 10# For license information, see LICENSE.TXT 11# 12# Modifications to the original VADER code have been made in order to 13# integrate it into NLTK. These have involved changes to 14# ensure Python 3 compatibility, and refactoring to achieve greater modularity. 15 16""" 17If you use the VADER sentiment analysis tools, please cite: 18 19Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for 20Sentiment Analysis of Social Media Text. Eighth International Conference on 21Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. 22""" 23 24import math 25import re 26import string 27from itertools import product 28import nltk.data 29from .util import pairwise 30 31##Constants## 32 33# (empirically derived mean sentiment intensity rating increase for booster words) 34B_INCR = 0.293 35B_DECR = -0.293 36 37# (empirically derived mean sentiment intensity rating increase for using 38# ALLCAPs to emphasize a word) 39C_INCR = 0.733 40 41N_SCALAR = -0.74 42 43# for removing punctuation 44REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation))) 45 46PUNC_LIST = [ 47 ".", 48 "!", 49 "?", 50 ",", 51 ";", 52 ":", 53 "-", 54 "'", 55 "\"", 56 "!!", 57 "!!!", 58 "??", 59 "???", 60 "?!?", 61 "!?!", 62 "?!?!", 63 "!?!?", 64] 65NEGATE = { 66 "aint", 67 "arent", 68 "cannot", 69 "cant", 70 "couldnt", 71 "darent", 72 "didnt", 73 "doesnt", 74 "ain't", 75 "aren't", 76 "can't", 77 "couldn't", 78 "daren't", 79 "didn't", 80 "doesn't", 81 "dont", 82 "hadnt", 83 "hasnt", 84 "havent", 85 "isnt", 86 "mightnt", 87 "mustnt", 88 "neither", 89 "don't", 90 "hadn't", 91 "hasn't", 92 "haven't", 93 "isn't", 94 "mightn't", 95 "mustn't", 96 "neednt", 97 "needn't", 98 "never", 99 "none", 100 "nope", 101 "nor", 102 "not", 103 "nothing", 104 "nowhere", 105 "oughtnt", 106 "shant", 107 "shouldnt", 108 "uhuh", 109 "wasnt", 110 "werent", 111 "oughtn't", 112 "shan't", 113 "shouldn't", 114 "uh-uh", 115 "wasn't", 116 "weren't", 117 "without", 118 "wont", 119 "wouldnt", 120 "won't", 121 "wouldn't", 122 "rarely", 123 "seldom", 124 "despite", 125} 126 127# booster/dampener 'intensifiers' or 'degree adverbs' 128# http://en.wiktionary.org/wiki/Category:English_degree_adverbs 129 130BOOSTER_DICT = { 131 "absolutely": B_INCR, 132 "amazingly": B_INCR, 133 "awfully": B_INCR, 134 "completely": B_INCR, 135 "considerably": B_INCR, 136 "decidedly": B_INCR, 137 "deeply": B_INCR, 138 "effing": B_INCR, 139 "enormously": B_INCR, 140 "entirely": B_INCR, 141 "especially": B_INCR, 142 "exceptionally": B_INCR, 143 "extremely": B_INCR, 144 "fabulously": B_INCR, 145 "flipping": B_INCR, 146 "flippin": B_INCR, 147 "fricking": B_INCR, 148 "frickin": B_INCR, 149 "frigging": B_INCR, 150 "friggin": B_INCR, 151 "fully": B_INCR, 152 "fucking": B_INCR, 153 "greatly": B_INCR, 154 "hella": B_INCR, 155 "highly": B_INCR, 156 "hugely": B_INCR, 157 "incredibly": B_INCR, 158 "intensely": B_INCR, 159 "majorly": B_INCR, 160 "more": B_INCR, 161 "most": B_INCR, 162 "particularly": B_INCR, 163 "purely": B_INCR, 164 "quite": B_INCR, 165 "really": B_INCR, 166 "remarkably": B_INCR, 167 "so": B_INCR, 168 "substantially": B_INCR, 169 "thoroughly": B_INCR, 170 "totally": B_INCR, 171 "tremendously": B_INCR, 172 "uber": B_INCR, 173 "unbelievably": B_INCR, 174 "unusually": B_INCR, 175 "utterly": B_INCR, 176 "very": B_INCR, 177 "almost": B_DECR, 178 "barely": B_DECR, 179 "hardly": B_DECR, 180 "just enough": B_DECR, 181 "kind of": B_DECR, 182 "kinda": B_DECR, 183 "kindof": B_DECR, 184 "kind-of": B_DECR, 185 "less": B_DECR, 186 "little": B_DECR, 187 "marginally": B_DECR, 188 "occasionally": B_DECR, 189 "partly": B_DECR, 190 "scarcely": B_DECR, 191 "slightly": B_DECR, 192 "somewhat": B_DECR, 193 "sort of": B_DECR, 194 "sorta": B_DECR, 195 "sortof": B_DECR, 196 "sort-of": B_DECR, 197} 198 199# check for special case idioms using a sentiment-laden keyword known to SAGE 200SPECIAL_CASE_IDIOMS = { 201 "the shit": 3, 202 "the bomb": 3, 203 "bad ass": 1.5, 204 "yeah right": -2, 205 "cut the mustard": 2, 206 "kiss of death": -1.5, 207 "hand to mouth": -2, 208} 209 210 211##Static methods## 212 213 214def negated(input_words, include_nt=True): 215 """ 216 Determine if input contains negation words 217 """ 218 neg_words = NEGATE 219 if any(word.lower() in neg_words for word in input_words): 220 return True 221 if include_nt: 222 if any("n't" in word.lower() for word in input_words): 223 return True 224 for first, second in pairwise(input_words): 225 if second.lower() == "least" and first.lower() != 'at': 226 return True 227 return False 228 229 230def normalize(score, alpha=15): 231 """ 232 Normalize the score to be between -1 and 1 using an alpha that 233 approximates the max expected value 234 """ 235 norm_score = score / math.sqrt((score * score) + alpha) 236 return norm_score 237 238 239def allcap_differential(words): 240 """ 241 Check whether just some words in the input are ALL CAPS 242 243 :param list words: The words to inspect 244 :returns: `True` if some but not all items in `words` are ALL CAPS 245 """ 246 is_different = False 247 allcap_words = 0 248 for word in words: 249 if word.isupper(): 250 allcap_words += 1 251 cap_differential = len(words) - allcap_words 252 if 0 < cap_differential < len(words): 253 is_different = True 254 return is_different 255 256 257def scalar_inc_dec(word, valence, is_cap_diff): 258 """ 259 Check if the preceding words increase, decrease, or negate/nullify the 260 valence 261 """ 262 scalar = 0.0 263 word_lower = word.lower() 264 if word_lower in BOOSTER_DICT: 265 scalar = BOOSTER_DICT[word_lower] 266 if valence < 0: 267 scalar *= -1 268 # check if booster/dampener word is in ALLCAPS (while others aren't) 269 if word.isupper() and is_cap_diff: 270 if valence > 0: 271 scalar += C_INCR 272 else: 273 scalar -= C_INCR 274 return scalar 275 276 277class SentiText(object): 278 """ 279 Identify sentiment-relevant string-level properties of input text. 280 """ 281 282 def __init__(self, text): 283 if not isinstance(text, str): 284 text = str(text.encode('utf-8')) 285 self.text = text 286 self.words_and_emoticons = self._words_and_emoticons() 287 # doesn't separate words from\ 288 # adjacent punctuation (keeps emoticons & contractions) 289 self.is_cap_diff = allcap_differential(self.words_and_emoticons) 290 291 def _words_plus_punc(self): 292 """ 293 Returns mapping of form: 294 { 295 'cat,': 'cat', 296 ',cat': 'cat', 297 } 298 """ 299 no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text) 300 # removes punctuation (but loses emoticons & contractions) 301 words_only = no_punc_text.split() 302 # remove singletons 303 words_only = set(w for w in words_only if len(w) > 1) 304 # the product gives ('cat', ',') and (',', 'cat') 305 punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)} 306 punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)} 307 words_punc_dict = punc_before 308 words_punc_dict.update(punc_after) 309 return words_punc_dict 310 311 def _words_and_emoticons(self): 312 """ 313 Removes leading and trailing puncutation 314 Leaves contractions and most emoticons 315 Does not preserve punc-plus-letter emoticons (e.g. :D) 316 """ 317 wes = self.text.split() 318 words_punc_dict = self._words_plus_punc() 319 wes = [we for we in wes if len(we) > 1] 320 for i, we in enumerate(wes): 321 if we in words_punc_dict: 322 wes[i] = words_punc_dict[we] 323 return wes 324 325 326class SentimentIntensityAnalyzer(object): 327 """ 328 Give a sentiment intensity score to sentences. 329 """ 330 331 def __init__( 332 self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt" 333 ): 334 self.lexicon_file = nltk.data.load(lexicon_file) 335 self.lexicon = self.make_lex_dict() 336 337 def make_lex_dict(self): 338 """ 339 Convert lexicon file to a dictionary 340 """ 341 lex_dict = {} 342 for line in self.lexicon_file.split('\n'): 343 (word, measure) = line.strip().split('\t')[0:2] 344 lex_dict[word] = float(measure) 345 return lex_dict 346 347 def polarity_scores(self, text): 348 """ 349 Return a float for sentiment strength based on the input text. 350 Positive values are positive valence, negative value are negative 351 valence. 352 """ 353 sentitext = SentiText(text) 354 # text, words_and_emoticons, is_cap_diff = self.preprocess(text) 355 356 sentiments = [] 357 words_and_emoticons = sentitext.words_and_emoticons 358 for item in words_and_emoticons: 359 valence = 0 360 i = words_and_emoticons.index(item) 361 if ( 362 i < len(words_and_emoticons) - 1 363 and item.lower() == "kind" 364 and words_and_emoticons[i + 1].lower() == "of" 365 ) or item.lower() in BOOSTER_DICT: 366 sentiments.append(valence) 367 continue 368 369 sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments) 370 371 sentiments = self._but_check(words_and_emoticons, sentiments) 372 373 return self.score_valence(sentiments, text) 374 375 def sentiment_valence(self, valence, sentitext, item, i, sentiments): 376 is_cap_diff = sentitext.is_cap_diff 377 words_and_emoticons = sentitext.words_and_emoticons 378 item_lowercase = item.lower() 379 if item_lowercase in self.lexicon: 380 # get the sentiment valence 381 valence = self.lexicon[item_lowercase] 382 383 # check if sentiment laden word is in ALL CAPS (while others aren't) 384 if item.isupper() and is_cap_diff: 385 if valence > 0: 386 valence += C_INCR 387 else: 388 valence -= C_INCR 389 390 for start_i in range(0, 3): 391 if ( 392 i > start_i 393 and words_and_emoticons[i - (start_i + 1)].lower() 394 not in self.lexicon 395 ): 396 # dampen the scalar modifier of preceding words and emoticons 397 # (excluding the ones that immediately preceed the item) based 398 # on their distance from the current item. 399 s = scalar_inc_dec( 400 words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff 401 ) 402 if start_i == 1 and s != 0: 403 s = s * 0.95 404 if start_i == 2 and s != 0: 405 s = s * 0.9 406 valence = valence + s 407 valence = self._never_check( 408 valence, words_and_emoticons, start_i, i 409 ) 410 if start_i == 2: 411 valence = self._idioms_check(valence, words_and_emoticons, i) 412 413 # future work: consider other sentiment-laden idioms 414 # other_idioms = 415 # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2, 416 # "upper hand": 1, "break a leg": 2, 417 # "cooking with gas": 2, "in the black": 2, "in the red": -2, 418 # "on the ball": 2,"under the weather": -2} 419 420 valence = self._least_check(valence, words_and_emoticons, i) 421 422 sentiments.append(valence) 423 return sentiments 424 425 def _least_check(self, valence, words_and_emoticons, i): 426 # check for negation case using "least" 427 if ( 428 i > 1 429 and words_and_emoticons[i - 1].lower() not in self.lexicon 430 and words_and_emoticons[i - 1].lower() == "least" 431 ): 432 if ( 433 words_and_emoticons[i - 2].lower() != "at" 434 and words_and_emoticons[i - 2].lower() != "very" 435 ): 436 valence = valence * N_SCALAR 437 elif ( 438 i > 0 439 and words_and_emoticons[i - 1].lower() not in self.lexicon 440 and words_and_emoticons[i - 1].lower() == "least" 441 ): 442 valence = valence * N_SCALAR 443 return valence 444 445 def _but_check(self, words_and_emoticons, sentiments): 446 # check for modification in sentiment due to contrastive conjunction 'but' 447 if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons: 448 try: 449 bi = words_and_emoticons.index('but') 450 except ValueError: 451 bi = words_and_emoticons.index('BUT') 452 for sentiment in sentiments: 453 si = sentiments.index(sentiment) 454 if si < bi: 455 sentiments.pop(si) 456 sentiments.insert(si, sentiment * 0.5) 457 elif si > bi: 458 sentiments.pop(si) 459 sentiments.insert(si, sentiment * 1.5) 460 return sentiments 461 462 def _idioms_check(self, valence, words_and_emoticons, i): 463 onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i]) 464 465 twoonezero = "{0} {1} {2}".format( 466 words_and_emoticons[i - 2], 467 words_and_emoticons[i - 1], 468 words_and_emoticons[i], 469 ) 470 471 twoone = "{0} {1}".format( 472 words_and_emoticons[i - 2], words_and_emoticons[i - 1] 473 ) 474 475 threetwoone = "{0} {1} {2}".format( 476 words_and_emoticons[i - 3], 477 words_and_emoticons[i - 2], 478 words_and_emoticons[i - 1], 479 ) 480 481 threetwo = "{0} {1}".format( 482 words_and_emoticons[i - 3], words_and_emoticons[i - 2] 483 ) 484 485 sequences = [onezero, twoonezero, twoone, threetwoone, threetwo] 486 487 for seq in sequences: 488 if seq in SPECIAL_CASE_IDIOMS: 489 valence = SPECIAL_CASE_IDIOMS[seq] 490 break 491 492 if len(words_and_emoticons) - 1 > i: 493 zeroone = "{0} {1}".format( 494 words_and_emoticons[i], words_and_emoticons[i + 1] 495 ) 496 if zeroone in SPECIAL_CASE_IDIOMS: 497 valence = SPECIAL_CASE_IDIOMS[zeroone] 498 if len(words_and_emoticons) - 1 > i + 1: 499 zeroonetwo = "{0} {1} {2}".format( 500 words_and_emoticons[i], 501 words_and_emoticons[i + 1], 502 words_and_emoticons[i + 2], 503 ) 504 if zeroonetwo in SPECIAL_CASE_IDIOMS: 505 valence = SPECIAL_CASE_IDIOMS[zeroonetwo] 506 507 # check for booster/dampener bi-grams such as 'sort of' or 'kind of' 508 if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT: 509 valence = valence + B_DECR 510 return valence 511 512 def _never_check(self, valence, words_and_emoticons, start_i, i): 513 if start_i == 0: 514 if negated([words_and_emoticons[i - 1]]): 515 valence = valence * N_SCALAR 516 if start_i == 1: 517 if words_and_emoticons[i - 2] == "never" and ( 518 words_and_emoticons[i - 1] == "so" 519 or words_and_emoticons[i - 1] == "this" 520 ): 521 valence = valence * 1.5 522 elif negated([words_and_emoticons[i - (start_i + 1)]]): 523 valence = valence * N_SCALAR 524 if start_i == 2: 525 if ( 526 words_and_emoticons[i - 3] == "never" 527 and ( 528 words_and_emoticons[i - 2] == "so" 529 or words_and_emoticons[i - 2] == "this" 530 ) 531 or ( 532 words_and_emoticons[i - 1] == "so" 533 or words_and_emoticons[i - 1] == "this" 534 ) 535 ): 536 valence = valence * 1.25 537 elif negated([words_and_emoticons[i - (start_i + 1)]]): 538 valence = valence * N_SCALAR 539 return valence 540 541 def _punctuation_emphasis(self, sum_s, text): 542 # add emphasis from exclamation points and question marks 543 ep_amplifier = self._amplify_ep(text) 544 qm_amplifier = self._amplify_qm(text) 545 punct_emph_amplifier = ep_amplifier + qm_amplifier 546 return punct_emph_amplifier 547 548 def _amplify_ep(self, text): 549 # check for added emphasis resulting from exclamation points (up to 4 of them) 550 ep_count = text.count("!") 551 if ep_count > 4: 552 ep_count = 4 553 # (empirically derived mean sentiment intensity rating increase for 554 # exclamation points) 555 ep_amplifier = ep_count * 0.292 556 return ep_amplifier 557 558 def _amplify_qm(self, text): 559 # check for added emphasis resulting from question marks (2 or 3+) 560 qm_count = text.count("?") 561 qm_amplifier = 0 562 if qm_count > 1: 563 if qm_count <= 3: 564 # (empirically derived mean sentiment intensity rating increase for 565 # question marks) 566 qm_amplifier = qm_count * 0.18 567 else: 568 qm_amplifier = 0.96 569 return qm_amplifier 570 571 def _sift_sentiment_scores(self, sentiments): 572 # want separate positive versus negative sentiment scores 573 pos_sum = 0.0 574 neg_sum = 0.0 575 neu_count = 0 576 for sentiment_score in sentiments: 577 if sentiment_score > 0: 578 pos_sum += ( 579 float(sentiment_score) + 1 580 ) # compensates for neutral words that are counted as 1 581 if sentiment_score < 0: 582 neg_sum += ( 583 float(sentiment_score) - 1 584 ) # when used with math.fabs(), compensates for neutrals 585 if sentiment_score == 0: 586 neu_count += 1 587 return pos_sum, neg_sum, neu_count 588 589 def score_valence(self, sentiments, text): 590 if sentiments: 591 sum_s = float(sum(sentiments)) 592 # compute and add emphasis from punctuation in text 593 punct_emph_amplifier = self._punctuation_emphasis(sum_s, text) 594 if sum_s > 0: 595 sum_s += punct_emph_amplifier 596 elif sum_s < 0: 597 sum_s -= punct_emph_amplifier 598 599 compound = normalize(sum_s) 600 # discriminate between positive, negative and neutral sentiment scores 601 pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments) 602 603 if pos_sum > math.fabs(neg_sum): 604 pos_sum += punct_emph_amplifier 605 elif pos_sum < math.fabs(neg_sum): 606 neg_sum -= punct_emph_amplifier 607 608 total = pos_sum + math.fabs(neg_sum) + neu_count 609 pos = math.fabs(pos_sum / total) 610 neg = math.fabs(neg_sum / total) 611 neu = math.fabs(neu_count / total) 612 613 else: 614 compound = 0.0 615 pos = 0.0 616 neg = 0.0 617 neu = 0.0 618 619 sentiment_dict = { 620 "neg": round(neg, 3), 621 "neu": round(neu, 3), 622 "pos": round(pos, 3), 623 "compound": round(compound, 4), 624 } 625 626 return sentiment_dict 627