"""Generate Treebank tokenizer test fixtures.

Reads raw texts from ``testdata/tokenize.json``, splits each text into
sentences with NLTK's Punkt sentence tokenizer, tokenizes every sentence
with the Treebank word tokenizer, and writes the results to
``testdata/treebank_sents.json`` (the sentences) and
``testdata/treebank_words.json`` (one token list per sentence).
"""
import json
import os

from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

t = TreebankWordTokenizer()

# Explicit UTF-8 so the fixtures round-trip identically on every platform
# (the default locale encoding is cp1252 on Windows and would corrupt
# non-ASCII test text).
with open(os.path.join('testdata', 'tokenize.json'), encoding='utf-8') as d:
    data = json.load(d)

words = []  # Treebank token lists, one per sentence, in input order
sents = []  # the sentences themselves, parallel to `words`
for text in data:
    for s in sent_tokenize(text):
        sents.append(s)
        words.append(t.tokenize(s))

with open(os.path.join('testdata', 'treebank_words.json'), 'w', encoding='utf-8') as f:
    json.dump(words, f, indent=4)

with open(os.path.join('testdata', 'treebank_sents.json'), 'w', encoding='utf-8') as f:
    json.dump(sents, f, indent=4)