import json
import os

from nltk.tokenize import TreebankWordTokenizer, sent_tokenize

# Load the raw test texts to be tokenized.
t = TreebankWordTokenizer()
with open(os.path.join('testdata', 'tokenize.json')) as d:
    data = json.load(d)

# Split each text into sentences, then word-tokenize each sentence
# with the Treebank tokenizer.
words = []
sents = []
for text in data:
    for s in sent_tokenize(text):
        sents.append(s)
        words.append(t.tokenize(s))

# Dump the reference word and sentence tokenizations alongside the input.
with open(os.path.join('testdata', 'treebank_words.json'), 'w') as f:
    json.dump(words, f, indent=4)

with open(os.path.join('testdata', 'treebank_sents.json'), 'w') as f:
    json.dump(sents, f, indent=4)
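
# Optional sanity check, a minimal sketch that is not part of the original
# script: reload the two dumps written above and confirm they stay aligned,
# i.e. there is exactly one token list per sentence.
with open(os.path.join('testdata', 'treebank_words.json')) as f:
    out_words = json.load(f)
with open(os.path.join('testdata', 'treebank_sents.json')) as f:
    out_sents = json.load(f)
assert len(out_words) == len(out_sents), 'word lists and sentences must pair up'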