1package tokenize 2 3import ( 4 "encoding/json" 5 "fmt" 6 "path/filepath" 7 "testing" 8 9 "github.com/jdkato/prose/internal/util" 10 "github.com/stretchr/testify/assert" 11) 12 13var testdata = filepath.Join("..", "testdata") 14 15func getWordData(file string) ([]string, [][]string) { 16 in := util.ReadDataFile(filepath.Join(testdata, "treebank_sents.json")) 17 out := util.ReadDataFile(filepath.Join(testdata, file)) 18 19 input := []string{} 20 output := [][]string{} 21 22 util.CheckError(json.Unmarshal(in, &input)) 23 util.CheckError(json.Unmarshal(out, &output)) 24 25 return input, output 26} 27 28func getWordBenchData() []string { 29 in := util.ReadDataFile(filepath.Join(testdata, "treebank_sents.json")) 30 input := []string{} 31 util.CheckError(json.Unmarshal(in, &input)) 32 return input 33} 34 35func ExampleNewWordBoundaryTokenizer() { 36 t := NewWordBoundaryTokenizer() 37 fmt.Println(t.Tokenize("They'll save and invest more.")) 38 // Output: [They'll save and invest more] 39} 40 41func ExampleNewWordPunctTokenizer() { 42 t := NewWordPunctTokenizer() 43 fmt.Println(t.Tokenize("They'll save and invest more.")) 44 // Output: [They ' ll save and invest more .] 45} 46 47func ExampleNewTreebankWordTokenizer() { 48 t := NewTreebankWordTokenizer() 49 fmt.Println(t.Tokenize("They'll save and invest more.")) 50 // Output: [They 'll save and invest more .] 51} 52 53func ExampleNewBlanklineTokenizer() { 54 t := NewBlanklineTokenizer() 55 fmt.Println(t.Tokenize("They'll save and invest more.\n\nThanks!")) 56 // Output: [They'll save and invest more. Thanks!] 57} 58 59func TestTextToWords(t *testing.T) { 60 text := "Vale is a natural language linter that supports plain text, markup (Markdown, reStructuredText, AsciiDoc, and HTML), and source code comments. Vale doesn't attempt to offer a one-size-fits-all collection of rules—instead, it strives to make customization as easy as possible." 61 expected := []string{ 62 "Vale", "is", "a", "natural", "language", "linter", "that", "supports", 63 "plain", "text", ",", "markup", "(", "Markdown", ",", "reStructuredText", 64 ",", "AsciiDoc", ",", "and", "HTML", ")", ",", "and", "source", "code", 65 "comments", ".", "Vale", "does", "n't", "attempt", "to", "offer", "a", 66 "one-size-fits-all", "collection", "of", "rules—instead", ",", "it", 67 "strives", "to", "make", "customization", "as", "easy", "as", "possible", 68 "."} 69 assert.Equal(t, expected, TextToWords(text)) 70} 71