1# -*- coding: utf-8 -*-
2"""
3Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
4This tokenizer code has gone through a long history:
5
6(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
7       TweetMotif: Exploratory Search and Topic Summarization for Twitter.
8       Brendan O'Connor, Michel Krieger, and David Ahn.
9       ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
10(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
11(2b) Jason Baldridge and David Snyder ported it to Scala
12(3) Brendan bugfixed the Scala port and merged with POS-specific changes
13    for the CMU ARK Twitter POS Tagger
14(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)
15
16Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP
17
18There have been at least 2 other Java ports, but they are not in the lineage for the code here.
19
20Ported to Python by Myle Ott <myleott@gmail.com>.
21"""
22
23from __future__ import print_function
24
25import operator
26import re
27import HTMLParser
28
29def regex_or(*items):
30    return '(?:' + '|'.join(items) + ')'
31
32Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
33Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)
34
35punctChars = r"['\"“”‘’.?!…,:;]"
36#punctSeq   = punctChars+"+"	#'anthem'. => ' anthem '.
37punctSeq   = r"['\"“”‘’]+|[.?!,…]+|[:;]+"	#'anthem'. => ' anthem ' .
38entity     = r"&(?:amp|lt|gt|quot);"
39#  URLs
40
41
42# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
43# If you actually empirically test it the results are bad.
44# Please see https://github.com/brendano/ark-tweet-nlp/pull/9
45
46urlStart1  = r"(?:https?://|\bwww\.)"
47commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
48ccTLDs	 = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
49r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
50r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
51r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
52r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
53r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
54r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
55r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)"	#TODO: remove obscure country domains?
56urlStart2  = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
57urlBody    = r"(?:[^\.\s<>][^\s<>]*?)?"
58urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
59urlEnd     = r"(?:\.\.+|[<>]|\s|$)"
60url        = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
61
62
63# Numeric
64timeLike   = r"\d+(?::\d+){1,2}"
65#numNum     = r"\d+\.\d+"
66numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
67numComb	 = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')
68
69# Abbreviations
70boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
71aa1  = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
72aa2  = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
73standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
74arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
75separators  = "(?:--+|―|—|~|–|=)"
76decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
77thingsThatSplitWords = r"[^\s\.,?\"]"
78embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"
79
80#  Emoticons
81# myleott: in Python the (?iu) flags affect the whole expression
82#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
83normalEyes = "[:=]" # 8 and x are eyes but cause problems
84wink = "[;]"
85noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
86happyMouths = r"[D\)\]\}]+"
87sadMouths = r"[\(\[\{]+"
88tongue = "[pPd3]+"
89otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned
90
91# mouth repetition examples:
92# @aliciakeys Put it in a love song :-))
93# @hellocalyclops =))=))=)) Oh well
94
95# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
96#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
97bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
98bfCenter = r"(?:[\.]|[_-]+)"
99bfRight = r"\2"
100s3 = r"(?:--['\"])"
101s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
102s5 = "(?:[.][_]+[.])"
103# myleott: in Python the (?i) flag affects the whole expression
104#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
105basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
106
107eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
108eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
109eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
110eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight
111
112oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"
113
114
115emoticon = regex_or(
116        # Standard version  :) :( :] :D :P
117        "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),
118
119        # reversed version (: D:  use positive lookbehind to remove "(word):"
120        # because eyes on the right side is more ambiguous with the standard usage of : ;
121        regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",
122
123        #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
124        eastEmote.replace("2", "1", 1), basicface,
125        # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
126        # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this
127
128        # myleott: o.O and O.o are two of the biggest sources of differences
129        #          between this and the Java version. One little hack won't hurt...
130        oOEmote
131)
132
133Hearts = "(?:<+/?3+)+" #the other hearts are in decorations
134
135Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))
136
137# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
138# "hello (#hashtag)" ==> "hello (#hashtag )"  WRONG
139# "hello (#hashtag)" ==> "hello ( #hashtag )"  RIGHT
140# "hello (@person)" ==> "hello (@person )"  WRONG
141# "hello (@person)" ==> "hello ( @person )"  RIGHT
142# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
143# has poor content-symbol detection.
144
145# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
146# If you want good hashtag identification, use a different regex.
147Hashtag = "#[a-zA-Z0-9_]+"  #optional: lookbehind for \b
148#optional: lookbehind for \b, max length 15
149AtMention = "[@@][a-zA-Z0-9_]+"
150
151# I was worried this would conflict with at-mentions
152# but seems ok in sample of 5800: 7 changes all email fixes
153# http://www.regular-expressions.info/email.html
154Bound = r"(?:\W|^|$)"
155Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"
156
157# We will be tokenizing using these regexps as delimiters
158# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
159Protected  = re.compile(
160    unicode(regex_or(
161        Hearts,
162        url,
163        Email,
164        timeLike,
165        #numNum,
166        numberWithCommas,
167        numComb,
168        emoticon,
169        Arrows,
170        entity,
171        punctSeq,
172        arbitraryAbbrev,
173        separators,
174        decorations,
175        embeddedApostrophe,
176        Hashtag,
177        AtMention
178    ).decode('utf-8')), re.UNICODE)
179
180# Edge punctuation
181# Want: 'foo' => ' foo '
182# While also:   don't => don't
183# the first is considered "edge punctuation".
184# the second is word-internal punctuation -- don't want to mess with it.
185# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
186# I remember it causing lots of trouble in the past as well.  Would be good to revisit or eliminate.
187
188# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
189#edgePunctChars    = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
190edgePunctChars    = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
191edgePunct    = "[" + edgePunctChars + "]"
192notEdgePunct = "[a-zA-Z0-9]" # content characters
193offEdge = r"(^|$|:|;|\s|\.|,)"  # colon here gets "(hello):" ==> "( hello ):"
194EdgePunctLeft  = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
195EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)
196
197def splitEdgePunct(input):
198    input = EdgePunctLeft.sub(r"\1\2 \3", input)
199    input = EdgePunctRight.sub(r"\1 \2\3", input)
200    return input
201
202# The main work of tokenizing a tweet.
203def simpleTokenize(text):
204
205    # Do the no-brainers first
206    splitPunctText = splitEdgePunct(text)
207
208    textLength = len(splitPunctText)
209
210    # BTO: the logic here got quite convoluted via the Scala porting detour
211    # It would be good to switch back to a nice simple procedural style like in the Python version
212    # ... Scala is such a pain.  Never again.
213
214    # Find the matches for subsequences that should be protected,
215    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
216    bads = []
217    badSpans = []
218    for match in Protected.finditer(splitPunctText):
219        # The spans of the "bads" should not be split.
220        if (match.start() != match.end()): #unnecessary?
221            bads.append( [splitPunctText[match.start():match.end()]] )
222            badSpans.append( (match.start(), match.end()) )
223
224    # Create a list of indices to create the "goods", which can be
225    # split. We are taking "bad" spans like
226    #     List((2,5), (8,10))
227    # to create
228    #     List(0, 2, 5, 8, 10, 12)
229    # where, e.g., "12" here would be the textLength
230    # has an even length and no indices are the same
231    indices = [0]
232    for (first, second) in badSpans:
233        indices.append(first)
234        indices.append(second)
235    indices.append(textLength)
236
237    # Group the indices and map them to their respective portion of the string
238    splitGoods = []
239    for i in range(0, len(indices), 2):
240        goodstr = splitPunctText[indices[i]:indices[i+1]]
241        splitstr = goodstr.strip().split(" ")
242        splitGoods.append(splitstr)
243
244    #  Reinterpolate the 'good' and 'bad' Lists, ensuring that
245    #  additonal tokens from last good item get included
246    zippedStr = []
247    for i in range(len(bads)):
248        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
249        zippedStr = addAllnonempty(zippedStr, bads[i])
250    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])
251
252    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
253    # Uncomment to get "you 're"
254    #splitStr = []
255    #for tok in zippedStr:
256    #    splitStr.extend(splitToken(tok))
257    #zippedStr = splitStr
258
259    return zippedStr
260
261def addAllnonempty(master, smaller):
262    for s in smaller:
263        strim = s.strip()
264        if (len(strim) > 0):
265            master.append(strim)
266    return master
267
268# "foo   bar " => "foo bar"
269def squeezeWhitespace(input):
270    return Whitespace.sub(" ", input).strip()
271
272# Final pass tokenization based on special patterns
273def splitToken(token):
274    m = Contractions.search(token)
275    if m:
276        return [m.group(1), m.group(2)]
277    return [token]
278
279# Assume 'text' has no HTML escaping.
280def tokenize(text):
281    return simpleTokenize(squeezeWhitespace(text))
282
283
284# Twitter text comes HTML-escaped, so unescape it.
285# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
286def normalizeTextForTagger(text):
287    text = text.replace("&amp;", "&")
288    text = HTMLParser.HTMLParser().unescape(text)
289    return text
290
291# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
292#
293# This function normalizes the input text BEFORE calling the tokenizer.
294# So the tokens you get back may not exactly correspond to
295# substrings of the original text.
296def tokenizeRawTweetText(text):
297    tokens = tokenize(normalizeTextForTagger(text))
298    return tokens
299
300