# coding: utf-8
# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: C.J. Hutto <Clayton.Hutto@gtri.gatech.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <24alsecondo@gmail.com> (modifications)
#         George Berry <geb97@cornell.edu> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.

"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""

import math
import re
import string
from itertools import product
import nltk.data
from .util import pairwise

##Constants##

# (empirically derived mean sentiment intensity rating increase for booster words)
B_INCR = 0.293
B_DECR = -0.293

# (empirically derived mean sentiment intensity rating increase for using
# ALLCAPs to emphasize a word)
C_INCR = 0.733

N_SCALAR = -0.74

# for removing punctuation
REGEX_REMOVE_PUNCTUATION = re.compile('[{0}]'.format(re.escape(string.punctuation)))

PUNC_LIST = [
    ".",
    "!",
    "?",
    ",",
    ";",
    ":",
    "-",
    "'",
    "\"",
    "!!",
    "!!!",
    "??",
    "???",
    "?!?",
    "!?!",
    "?!?!",
    "!?!?",
]
NEGATE = {
    "aint",
    "arent",
    "cannot",
    "cant",
    "couldnt",
    "darent",
    "didnt",
    "doesnt",
    "ain't",
    "aren't",
    "can't",
    "couldn't",
    "daren't",
    "didn't",
    "doesn't",
    "dont",
    "hadnt",
    "hasnt",
    "havent",
    "isnt",
    "mightnt",
    "mustnt",
    "neither",
    "don't",
    "hadn't",
    "hasn't",
    "haven't",
    "isn't",
    "mightn't",
    "mustn't",
    "neednt",
    "needn't",
    "never",
    "none",
    "nope",
    "nor",
    "not",
    "nothing",
    "nowhere",
    "oughtnt",
    "shant",
    "shouldnt",
    "uhuh",
    "wasnt",
    "werent",
    "oughtn't",
    "shan't",
    "shouldn't",
    "uh-uh",
    "wasn't",
    "weren't",
    "without",
    "wont",
    "wouldnt",
    "won't",
    "wouldn't",
    "rarely",
    "seldom",
    "despite",
}

# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs

BOOSTER_DICT = {
    "absolutely": B_INCR,
    "amazingly": B_INCR,
    "awfully": B_INCR,
    "completely": B_INCR,
    "considerably": B_INCR,
    "decidedly": B_INCR,
    "deeply": B_INCR,
    "effing": B_INCR,
    "enormously": B_INCR,
    "entirely": B_INCR,
    "especially": B_INCR,
    "exceptionally": B_INCR,
    "extremely": B_INCR,
    "fabulously": B_INCR,
    "flipping": B_INCR,
    "flippin": B_INCR,
    "fricking": B_INCR,
    "frickin": B_INCR,
    "frigging": B_INCR,
    "friggin": B_INCR,
    "fully": B_INCR,
    "fucking": B_INCR,
    "greatly": B_INCR,
    "hella": B_INCR,
    "highly": B_INCR,
    "hugely": B_INCR,
    "incredibly": B_INCR,
    "intensely": B_INCR,
    "majorly": B_INCR,
    "more": B_INCR,
    "most": B_INCR,
    "particularly": B_INCR,
    "purely": B_INCR,
    "quite": B_INCR,
    "really": B_INCR,
    "remarkably": B_INCR,
    "so": B_INCR,
    "substantially": B_INCR,
    "thoroughly": B_INCR,
    "totally": B_INCR,
    "tremendously": B_INCR,
    "uber": B_INCR,
    "unbelievably": B_INCR,
    "unusually": B_INCR,
    "utterly": B_INCR,
    "very": B_INCR,
    "almost": B_DECR,
    "barely": B_DECR,
    "hardly": B_DECR,
    "just enough": B_DECR,
    "kind of": B_DECR,
    "kinda": B_DECR,
    "kindof": B_DECR,
    "kind-of": B_DECR,
    "less": B_DECR,
    "little": B_DECR,
    "marginally": B_DECR,
    "occasionally": B_DECR,
    "partly": B_DECR,
    "scarcely": B_DECR,
    "slightly": B_DECR,
    "somewhat": B_DECR,
    "sort of": B_DECR,
    "sorta": B_DECR,
    "sortof": B_DECR,
    "sort-of": B_DECR,
}

# check for special case idioms using a sentiment-laden keyword known to SAGE
SPECIAL_CASE_IDIOMS = {
    "the shit": 3,
    "the bomb": 3,
    "bad ass": 1.5,
    "yeah right": -2,
    "cut the mustard": 2,
    "kiss of death": -1.5,
    "hand to mouth": -2,
}


##Static methods##


def negated(input_words, include_nt=True):
    """
    Determine if input contains negation words
    """
    neg_words = NEGATE
    if any(word.lower() in neg_words for word in input_words):
        return True
    if include_nt:
        if any("n't" in word.lower() for word in input_words):
            return True
    for first, second in pairwise(input_words):
        if second.lower() == "least" and first.lower() != 'at':
            return True
    return False
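

# Illustrative only (not part of the original VADER code): quick doctest-style
# checks of ``negated``, assuming the NEGATE set defined above.
#
#     >>> negated(["not", "good"])   # "not" is in NEGATE
#     True
#     >>> negated(["at", "least"])   # "at least" is explicitly not a negation
#     False
#     >>> negated(["the", "least"])  # bare "least" without "at" does negate
#     True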


def normalize(score, alpha=15):
    """
    Normalize the score to be between -1 and 1 using an alpha that
    approximates the max expected value
    """
    norm_score = score / math.sqrt((score * score) + alpha)
    return norm_score
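

# Illustrative only: with the default alpha=15, a raw score of 4.0 maps to
# 4 / sqrt(4*4 + 15) = 4 / sqrt(31), roughly 0.7184, so large magnitudes are
# squashed smoothly into the open interval (-1, 1).
#
#     >>> round(normalize(4.0), 4)
#     0.7184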


def allcap_differential(words):
    """
    Check whether just some words in the input are ALL CAPS

    :param list words: The words to inspect
    :returns: `True` if some but not all items in `words` are ALL CAPS
    """
    is_different = False
    allcap_words = 0
    for word in words:
        if word.isupper():
            allcap_words += 1
    cap_differential = len(words) - allcap_words
    if 0 < cap_differential < len(words):
        is_different = True
    return is_different
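

# Illustrative only: the differential is True when capitalization carries
# information (some words shouting, others not), and False when everything
# or nothing is upper-case.
#
#     >>> allcap_differential(["GREAT", "movie"])
#     True
#     >>> allcap_differential(["GREAT", "MOVIE"])
#     False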


def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Check if the preceding words increase, decrease, or negate/nullify the
    valence
    """
    scalar = 0.0
    word_lower = word.lower()
    if word_lower in BOOSTER_DICT:
        scalar = BOOSTER_DICT[word_lower]
        if valence < 0:
            scalar *= -1
        # check if booster/dampener word is in ALLCAPS (while others aren't)
        if word.isupper() and is_cap_diff:
            if valence > 0:
                scalar += C_INCR
            else:
                scalar -= C_INCR
    return scalar
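

# Illustrative only: "very" boosts a positive valence by B_INCR, the ALL-CAPS
# form adds C_INCR on top when capitalization is informative, and a dampener
# like "barely" contributes B_DECR.
#
#     >>> scalar_inc_dec("very", 2.0, False)
#     0.293
#     >>> round(scalar_inc_dec("VERY", 2.0, True), 3)
#     1.026
#     >>> scalar_inc_dec("barely", 2.0, False)
#     -0.293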


class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text):
        if not isinstance(text, str):
            # decode bytes; fall back to str() for any other type
            # (bytes objects have no encode() method in Python 3)
            text = text.decode('utf-8') if isinstance(text, bytes) else str(text)
        self.text = text
        # doesn't separate words from adjacent punctuation
        # (keeps emoticons & contractions)
        self.words_and_emoticons = self._words_and_emoticons()
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = set(w for w in words_only if len(w) > 1)
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
        punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation.
        Leaves contractions and most emoticons.
            Does not preserve punctuation-plus-letter emoticons (e.g. :D).
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes
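

# Illustrative only: how SentiText tokenizes. Trailing punctuation is
# stripped from ordinary words, single-character tokens are dropped, and
# multi-character emoticons survive intact.
#
#     >>> st = SentiText("I :) love it!")
#     >>> st.words_and_emoticons
#     [':)', 'love', 'it']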


class SentimentIntensityAnalyzer(object):
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self, lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt"
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split('\n'):
            if not line:
                continue  # skip blank lines, e.g. from a trailing newline
            (word, measure) = line.strip().split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
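
    # Illustrative only (format inferred from the parsing above): each lexicon
    # line is tab-separated, starting with the token and its mean valence
    # rating; any further fields (e.g. standard deviation, raw ratings) are
    # discarded by the [0:2] slice:
    #
    #     token<TAB>mean_valence<TAB>...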

    def polarity_scores(self, text):
        """
        Return a dict of sentiment scores for the input text, with keys
        'neg', 'neu', 'pos' and 'compound'. Positive 'compound' values
        indicate positive valence, negative values indicate negative
        valence.
        """
        sentitext = SentiText(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # enumerate() rather than list.index(), which would always return the
        # position of the *first* occurrence of a repeated word
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        return self.score_valence(sentiments, text)

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2,"under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)

        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * N_SCALAR
        return valence
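
    # Illustrative only: for "was least happy", "least" flips and scales the
    # valence of "happy" by N_SCALAR (-0.74), assuming "least" itself is not
    # in the lexicon (the code checks); for "at least happy" or "very least
    # happy", the valence is left alone.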

    def _but_check(self, words_and_emoticons, sentiments):
        # check for modification in sentiment due to contrastive conjunction 'but'
        if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons:
            try:
                bi = words_and_emoticons.index('but')
            except ValueError:
                bi = words_and_emoticons.index('BUT')
            # enumerate() rather than list.index(), which would always find the
            # first occurrence of a duplicated score and rescale it repeatedly
            for si, sentiment in enumerate(sentiments):
                if si < bi:
                    sentiments[si] = sentiment * 0.5
                elif si > bi:
                    sentiments[si] = sentiment * 1.5
        return sentiments
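
    # Illustrative only: with words ["okay", "but", "great"] and raw scores
    # [1.0, 0, 2.0], 'but' sits at index 1, so scores before it are halved
    # and scores after it are boosted: [0.5, 0, 3.0].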

    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = "{0} {1}".format(words_and_emoticons[i - 1], words_and_emoticons[i])

        twoonezero = "{0} {1} {2}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = "{0} {1}".format(
            words_and_emoticons[i - 2], words_and_emoticons[i - 1]
        )

        threetwoone = "{0} {1} {2}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{0} {1}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = "{0} {1}".format(
                words_and_emoticons[i], words_and_emoticons[i + 1]
            )
            if zeroone in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT:
            valence = valence + B_DECR
        return valence
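
    # Illustrative only: scanning ["that", "was", "the", "bomb"] with i at
    # "bomb" builds the bigram "the bomb" (onezero), which matches
    # SPECIAL_CASE_IDIOMS and overrides the valence with 3.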

    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if negated([words_and_emoticons[i - 1]]):
                valence = valence * N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * N_SCALAR
        return valence
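
    # Illustrative only: in "never so good", the word two back from "good" is
    # "never" followed by "so", so the start_i == 1 branch multiplies the
    # valence of "good" by 1.5 instead of negating it; a plain "not good"
    # takes the start_i == 0 branch and scales by N_SCALAR.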

    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier
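
    # Illustrative only: each "!" adds 0.292 and the count is capped at four,
    # so "Great!!!" contributes 3 * 0.292 = 0.876 of emphasis and
    # "Great!!!!!!" still contributes only 4 * 0.292 = 1.168.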

    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier
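
    # Illustrative only: a single "?" adds nothing, "??" adds 2 * 0.18 = 0.36,
    # "???" adds 0.54, and four or more question marks saturate at 0.96.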

    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count
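
    # Illustrative only: scores [1.5, -0.5, 0] sift into pos_sum = 2.5
    # (1.5 + 1), neg_sum = -1.5 (-0.5 - 1) and neu_count = 1, so every
    # non-neutral token carries at least unit weight alongside the neutrals.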

    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
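
# Example usage (illustrative only, not part of the original module). It
# assumes the NLTK vader_lexicon data package is installed, e.g. via
# nltk.download('vader_lexicon'); exact scores depend on the lexicon version,
# so none are hard-coded here:
#
#     >>> sia = SentimentIntensityAnalyzer()
#     >>> scores = sia.polarity_scores("VADER is smart, handsome, and funny!")
#     >>> sorted(scores.keys())
#     ['compound', 'neg', 'neu', 'pos']
#     >>> scores['compound'] > 0
#     True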