.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT

    >>> from __future__ import print_function
    >>> from nltk.tokenize import *

Regression Tests: Treebank Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some test strings.

    >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
    >>> word_tokenize(s1)
    ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']
    >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
    >>> word_tokenize(s2)
    ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
    >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
    >>> word_tokenize(s3)
    ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']
    >>> s4 = "I cannot cannot work under these conditions!"
    >>> word_tokenize(s4)
    ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']
    >>> s5 = "The company spent $30,000,000 last year."
    >>> word_tokenize(s5)
    ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']
    >>> s6 = "The company spent 40.75% of its income last year."
    >>> word_tokenize(s6)
    ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']
    >>> s7 = "He arrived at 3:00 pm."
    >>> word_tokenize(s7)
    ['He', 'arrived', 'at', '3:00', 'pm', '.']
    >>> s8 = "I bought these items: books, pencils, and pens."
    >>> word_tokenize(s8)
    ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']
    >>> s9 = "Though there were 150, 100 of them were old."
    >>> word_tokenize(s9)
    ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']
    >>> s10 = "There were 300,000, but that wasn't enough."
    >>> word_tokenize(s10)
    ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']


Testing improvement made to the TreebankWordTokenizer

    >>> sx1 = u'\xabNow that I can do.\xbb'
    >>> expected = [u'\xab', u'Now', u'that', u'I', u'can', u'do', u'.', u'\xbb']
    >>> word_tokenize(sx1) == expected
    True
    >>> sx2 = u'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
    >>> expected = [u'The', u'unicode', u'201C', u'and', u'201D', u'\u201c', u'LEFT', u'(', u'RIGHT', u')', u'DOUBLE', u'QUOTATION', u'MARK', u'\u201d', u'is', u'also', u'OPEN_PUNCT', u'and', u'CLOSE_PUNCT', u'.']
    >>> word_tokenize(sx2) == expected
    True


Sentence tokenization in word_tokenize:

    >>> s11 = "I called Dr. Jones. I called Dr. Jones."
    >>> word_tokenize(s11)
    ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.']
    >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen "
    ... "Kuchen einzukaufen. Ich muss.")
    >>> word_tokenize(s12)
    ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw',
    '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
    >>> word_tokenize(s12, 'german')
    ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.',
    'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']


Regression Tests: Regexp Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some additional test strings.

    >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
    ... "two of them.\n\nThanks.")
    >>> s2 = ("Alas, it has not rained today. When, do you think, "
    ... "will it rain again?")
    >>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
    ... "not relax our vigilance!</p>")

    >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
    [', ', '. ', ', ', ', ', '?']
    >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
    ['Alas', 'it has not rained today', 'When', 'do you think',
    'will it rain again']

Take care to avoid using capturing groups:

    >>> regexp_tokenize(s3, r'</?[bp]>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=True)
    ['Although this is ', 'not',
    ' the case here, we must not relax our vigilance!']

Named groups are capturing groups, and confuse the tokenizer:

    >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
    ['p', 'b', 'b', 'p']
    >>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
    ['p', 'Although this is ', 'b', 'not', 'b',
    ' the case here, we must not relax our vigilance!', 'p']

Make sure that nested groups don't confuse the tokenizer:

    >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False)
    ['las', 'has', 'rai', 'rai']
    >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True)
    ['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
    'n again?']

Back-references require capturing groups, and these are not supported:

    >>> regexp_tokenize("aabbbcccc", r'(.)\1')
    ['a', 'b', 'c', 'c']

A simple sentence tokenizer '\.(\s+|$)'

    >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True)
    ['Good muffins cost $3.88\nin New York',
    'Please buy me\ntwo of them', 'Thanks']


Regression Tests: TweetTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TweetTokenizer is a tokenizer specifically designed for micro-blogging tokenization tasks.

    >>> from nltk.tokenize import TweetTokenizer
    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
    >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
    >>> tknzr.tokenize(s1)
    ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
    >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
    >>> tknzr.tokenize(s2)
    ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
    >>> s3 = "@Insanomania They do... Their mentality doesn't :("
    >>> tknzr.tokenize(s3)
    ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
    >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
    >>> tknzr.tokenize(s4)
    ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
    >>> tknzr = TweetTokenizer(reduce_len=True)
    >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
    >>> tknzr.tokenize(s5)
    ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']

It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. Setting `strip_handles` to True, the tokenizer will remove Twitter handles (e.g. usernames). Setting `reduce_len` to True, repeated character sequences of length 3 or greater will be replaced with sequences of length 3.

    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
    >>> tknzr.tokenize(s6)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
    >>> tknzr.tokenize(s7)
    [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
    >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.'
    >>> tknzr.tokenize(s8)
    ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.']

The `preserve_case` parameter (default: True) allows you to convert uppercase tokens to lowercase tokens. Emoticons are not affected:

    >>> tknzr = TweetTokenizer(preserve_case=False)
    >>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P"
    >>> tknzr.tokenize(s9)
    ['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P']

It should not hang on long sequences of the same punctuation character.

    >>> tknzr = TweetTokenizer()
    >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
    >>> tknzr.tokenize(s10)
    [u'Photo', u':', u"Aujourd'hui", u'sur', u'http://t.co/0gebOFDUzn', u'Projet', u'...', u'http://t.co/bKfIUbydz2', u'...', u'http://fb.me/3b6uXpz0L']


Regression Tests: PunktSentenceTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The sentence splitter should remove whitespace following the sentence boundary.

    >>> pst = PunktSentenceTokenizer()
    >>> pst.tokenize('See Section 3). Or Section 2). ')
    ['See Section 3).', 'Or Section 2).']
    >>> pst.tokenize('See Section 3.) Or Section 2.) ')
    ['See Section 3.)', 'Or Section 2.)']
    >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False)
    ['See Section 3.', ') Or Section 2.', ')']


Two instances of PunktSentenceTokenizer should not share PunktParameters.

    >>> pst = PunktSentenceTokenizer()
    >>> pst2 = PunktSentenceTokenizer()
    >>> pst._params is pst2._params
    False

Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067

    >>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer
    >>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters
    >>> pbc = PunktBaseClass(lang_vars=None, params=None)
    >>> type(pbc._params)
    <class 'nltk.tokenize.punkt.PunktParameters'>
    >>> type(pbc._lang_vars)
    <class 'nltk.tokenize.punkt.PunktLanguageVars'>
    >>> pt = PunktTrainer(lang_vars=None)
    >>> type(pt._lang_vars)
    <class 'nltk.tokenize.punkt.PunktLanguageVars'>
    >>> pst = PunktSentenceTokenizer(lang_vars=None)
    >>> type(pst._lang_vars)
    <class 'nltk.tokenize.punkt.PunktLanguageVars'>


Regression Tests: align_tokens
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Post-hoc alignment of tokens with a source string

    >>> from nltk.tokenize.util import align_tokens
    >>> list(align_tokens([''], ""))
    [(0, 0)]
    >>> list(align_tokens([''], " "))
    [(0, 0)]
    >>> list(align_tokens([], ""))
    []
    >>> list(align_tokens([], " "))
    []
    >>> list(align_tokens(['a'], "a"))
    [(0, 1)]
    >>> list(align_tokens(['abc', 'def'], "abcdef"))
    [(0, 3), (3, 6)]
    >>> list(align_tokens(['abc', 'def'], "abc def"))
    [(0, 3), (4, 7)]
    >>> list(align_tokens(['ab', 'cd'], "ab cd ef"))
    [(0, 2), (3, 5)]
    >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef"))
    [(0, 2), (3, 5), (6, 8)]
    >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef"))
    Traceback (most recent call last):
    ....
    ValueError: substring "efg" not found in "ab cd ef"
    >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef"))
    Traceback (most recent call last):
    ....
    ValueError: substring "gh" not found in "ab cd ef"
    >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday."))
    [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)]


Regression Tests: MWETokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Pickle an MWETokenizer

    >>> from nltk.tokenize import MWETokenizer
    >>> import pickle

    >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
    >>> p = pickle.dumps(tokenizer)
    >>> unpickled = pickle.loads(p)
    >>> unpickled.tokenize("An hors d'oeuvre tonight, sir?".split())
    ['An', "hors+d'oeuvre", 'tonight,', 'sir?']


Regression Tests: TextTilingTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TextTilingTokenizer tokenizes text into coherent subtopic chunks based upon Hearst's TextTiling algorithm.

    >>> from nltk.tokenize import TextTilingTokenizer
    >>> from nltk.corpus import brown
    >>> tt = TextTilingTokenizer()
    >>> tt.tokenize(brown.raw()[0:1000])
    ["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"]

Test that `ValueError` exceptions are raised when illegal arguments are used.

    >>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000])
    Traceback (most recent call last):
    ...
    ValueError: Similarity method foo not recognized
    >>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000])
    Traceback (most recent call last):
    ...
    ValueError: Smoothing method bar not recognized
