1"""
2Example for using langid.py to identify the language of messages
on a Twitter livestream. Optionally, it can also filter messages
and display only those in one or more target languages.
5
Expects a Twitter stream on STDIN, such as the one provided by:
7
8# curl https://stream.twitter.com/1/statuses/sample.json -u<username> -s
9
Outputs "lang: message", one per line, to STDOUT
11
12Marco Lui, June 2012
13"""
14
15import sys
16import langid
17import json
18import optparse
19import re
20
21import _twokenize
22
23
# The _twokenize patterns whose matches clean_tweet() removes from a tweet
# before the text is passed to langid.
_NOISE_PATTERNS = (
  _twokenize.Hearts,
  _twokenize.url,
  _twokenize.Email,
  _twokenize.emoticon,
  _twokenize.Arrows,
  _twokenize.entity,
  _twokenize.decorations,
  _twokenize.Hashtag,
  _twokenize.AtMention,
)

# regex_or() combines the individual patterns into one expression; its result
# is decoded from UTF-8 and compiled with re.UNICODE.
_noise_source = _twokenize.regex_or(*_NOISE_PATTERNS)
to_clean = re.compile(_noise_source.decode('utf8'), re.UNICODE)
35
36
def clean_tweet(text):
  """Return text with every match of the module-level to_clean pattern removed."""
  stripped = re.sub(to_clean, '', text)
  return stripped
39
40
def squeeze_whitespace(text):
  """Return text with each run of whitespace replaced by a single space.

  Leading and trailing runs are collapsed to one space rather than removed,
  so the result may still start or end with a space.
  """
  # Raw string: '\s' in a plain literal is an invalid escape sequence, which
  # newer Python versions warn about. The pattern itself is unchanged.
  return re.sub(r'\s+', ' ', text)
43
44
if __name__ == "__main__":
  # Read a stream of JSON messages (one per line) from STDIN, identify the
  # language of each non-retweeted message and write "lang: message" lines to
  # STDOUT, optionally keeping only the languages requested with -l/--langs.
  parser = optparse.OptionParser()
  parser.add_option('-l', '--langs', dest='langs', help='comma-separated set of target ISO639 language codes (e.g en,de)')
  opts, args = parser.parse_args()

  # None means "no filtering". Codes are stripped so that "en, de" behaves
  # like "en,de"; empty entries (e.g. from a trailing comma) are dropped.
  if opts.langs:
    lang_set = set(code.strip() for code in opts.langs.split(",") if code.strip())
  else:
    lang_set = None

  try:
    # readline() is used instead of iterating sys.stdin directly: file
    # iteration in Python 2 goes through a read-ahead buffer, which delays
    # processing of messages arriving on a live stream.
    for line in iter(sys.stdin.readline, ''):
      # The streaming endpoint sends blank keep-alive lines; they are not
      # JSON and would make json.loads() raise ValueError.
      if not line.strip():
        continue

      j = json.loads(line)

      # Only messages whose retweet_count is exactly 0 are considered;
      # messages without that field are skipped as well.
      if j.get('retweet_count') != 0:
        continue

      text = j.get('text')
      if not text:
        continue

      # Classify the cleaned text, but output the original message.
      lang, _ = langid.classify(clean_tweet(text))
      if lang_set is None or lang in lang_set:
        sys.stdout.write("{0}: {1}\n".format(lang, squeeze_whitespace(text).encode('utf8')))
  except (IOError, KeyboardInterrupt):
    # Terminate on broken pipe or ^C
    pass
64
65