1""" 2Example for using langid.py to identify the language of messages 3on a twitter livestream. Optionally, it can also filter messages 4and display only those in a target language(s). 5 6Expects a Twitterstream on STDIN, such as the one provided by: 7 8# curl https://stream.twitter.com/1/statuses/sample.json -u<username> -s 9 10Outputs lang:message one-per-line to STDOUT 11 12Marco Lui, June 2012 13""" 14 15import sys 16import langid 17import json 18import optparse 19import re 20 21import _twokenize 22 23 24to_clean = re.compile(_twokenize.regex_or( 25 _twokenize.Hearts, 26 _twokenize.url, 27 _twokenize.Email, 28 _twokenize.emoticon, 29 _twokenize.Arrows, 30 _twokenize.entity, 31 _twokenize.decorations, 32 _twokenize.Hashtag, 33 _twokenize.AtMention, 34).decode('utf8'), re.UNICODE) 35 36 37def clean_tweet(text): 38 return to_clean.sub('', text) 39 40 41def squeeze_whitespace(text): 42 return re.sub('\s+', ' ', text) 43 44 45if __name__ == "__main__": 46 parser = optparse.OptionParser() 47 parser.add_option('-l', '--langs', dest='langs', help='comma-separated set of target ISO639 language codes (e.g en,de)') 48 opts, args = parser.parse_args() 49 50 lang_set = set(opts.langs.split(",")) if opts.langs else None 51 52 try: 53 for line in sys.stdin: 54 j = json.loads(line) 55 if j.get('retweet_count') == 0: 56 text = j.get('text') 57 if text: 58 lang, conf = langid.classify(clean_tweet(text)) 59 if lang_set is None or lang in lang_set: 60 print "{0}: {1}".format(lang, squeeze_whitespace(text).encode('utf8')) 61 except (IOError, KeyboardInterrupt): 62 # Terminate on broken pipe or ^C 63 pass 64 65