# -*- coding: utf-8 -*-
# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jon Dehdari
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""

import re

from six import text_type

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print (toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print (toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
    >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
    True
    """

    # Each entry below is a (compiled_regex, replacement) pair; they are
    # applied in the order listed in TOKTOK_REGEXES.

    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile(u"\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation (opening quotes/brackets).
    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
    # Pad En dash and em dash
    EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "

    # Replace problematic characters with numeric character references
    # so they survive tokenization (mirrors tok-tok.pl).
    AMPERCENT = re.compile('& '), '&amp; '
    TAB = re.compile('\t'), ' &#9; '
    PIPE = re.compile('\|'), ' &#124; '

    # Pad numbers with commas to keep them from further tokenization.
    COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '

    # Just pad problematic (often neurotic) hyphen/single quote, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
    # Group ` ` stupid quotes ' ' into a single token.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "

    # Don't tokenize period unless it ends the line and that it isn't
    # preceded by another period, e.g.
    # "something ..." -> "something ..."
    # "something." -> "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Don't tokenize period unless it ends the line eg.
    # " ... stuff." -> "... stuff ."
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"

    # Treat continuous commas as fake German/Czech low quotes, etc.: „
    MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
    # Treat continuous dashes as fake en-dash, etc.
    MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
    # Treat multiple periods as a thing (eg. ellipsis)
    MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '

    # This is the \p{Open_Punctuation} from Perl's perluniprops
    # see http://perldoc.perl.org/perluniprops.html
    OPEN_PUNCT = text_type(
        u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
        u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
        u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
        u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
        u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
        u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
        u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
        u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
        u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
    )
    # This is the \p{Close_Punctuation} from Perl's perluniprops
    CLOSE_PUNCT = text_type(
        u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
        u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
        u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
        u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
        u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
        u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
        u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
        u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
        u'\uff09\uff3d\uff5d\uff60\uff63'
    )
    # This is the \p{Currency_Symbol} from Perl's perluniprops
    CURRENCY_SYM = text_type(
        u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
        u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
        u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
        u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
        u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
        u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6'
    )

    # Pad spaces after opening punctuations.
    OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
    # Pad spaces before closing punctuations.
    CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '

    # Use for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r':(?!//)'), r' : '  # in perl s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? '  # in perl s{\?(?!\S)}{ ? }g;
    # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    # NOTE(review): the bracketed parts are character classes, not the
    # sequences the perl comment suggests; kept as-is to preserve behavior.
    URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
    URL_FOE_4 = re.compile(r' /'), r' / '  # s{ /}{ / }g;

    # Left/Right strip, i.e. remove heading/trailing spaces.
    # These strip regexes should NOT be used,
    # instead use str.lstrip(), str.rstrip() or str.strip()
    # (They are kept for reference purposes to the original toktok.pl code)
    LSTRIP = re.compile(r'^ +'), ''
    RSTRIP = re.compile(r'\s+$'), '\n'
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r' {2,}'), ' '

    # Substitution pipeline, applied in order by tokenize().
    # FINAL_PERIOD_2 appears twice as in the original tok-tok.pl port;
    # the second application is idempotent.
    TOKTOK_REGEXES = [
        NON_BREAKING,
        FUNKY_PUNCT_1,
        FUNKY_PUNCT_2,
        URL_FOE_1,
        URL_FOE_2,
        URL_FOE_3,
        URL_FOE_4,
        AMPERCENT,
        TAB,
        PIPE,
        OPEN_PUNCT_RE,
        CLOSE_PUNCT_RE,
        MULTI_COMMAS,
        COMMA_IN_NUM,
        FINAL_PERIOD_2,
        PROB_SINGLE_QUOTES,
        STUPID_QUOTES_1,
        STUPID_QUOTES_2,
        CURRENCY_SYM_RE,
        EN_EM_DASHES,
        MULTI_DASHES,
        MULTI_DOTS,
        FINAL_PERIOD_1,
        FINAL_PERIOD_2,
        ONE_SPACE,
    ]

    def tokenize(self, text, return_str=False):
        """
        Tokenize *text* by applying the TOKTOK_REGEXES pipeline in order.

        :param text: the input string (one sentence per line expected).
        :param return_str: if True, return the padded string itself;
            otherwise return the list of tokens (split on whitespace).
        :return: a unicode string or a list of unicode tokens.
        """
        text = text_type(text)  # Converts input string into unicode.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strips heading and trailing spaces
        # and converts output string into unicode.
        text = text_type(text.strip())
        return text if return_str else text.split()