1package tokenize
2
3import (
4	"encoding/json"
5	"fmt"
6	"path/filepath"
7	"testing"
8
9	"github.com/jdkato/prose/internal/util"
10	"github.com/stretchr/testify/assert"
11)
12
// testdata is the relative path ("../testdata", using the host OS separator)
// of the directory from which the helpers in this file load JSON fixtures.
var testdata = filepath.FromSlash("../testdata")
14
15func getWordData(file string) ([]string, [][]string) {
16	in := util.ReadDataFile(filepath.Join(testdata, "treebank_sents.json"))
17	out := util.ReadDataFile(filepath.Join(testdata, file))
18
19	input := []string{}
20	output := [][]string{}
21
22	util.CheckError(json.Unmarshal(in, &input))
23	util.CheckError(json.Unmarshal(out, &output))
24
25	return input, output
26}
27
28func getWordBenchData() []string {
29	in := util.ReadDataFile(filepath.Join(testdata, "treebank_sents.json"))
30	input := []string{}
31	util.CheckError(json.Unmarshal(in, &input))
32	return input
33}
34
35func ExampleNewWordBoundaryTokenizer() {
36	t := NewWordBoundaryTokenizer()
37	fmt.Println(t.Tokenize("They'll save and invest more."))
38	// Output: [They'll save and invest more]
39}
40
41func ExampleNewWordPunctTokenizer() {
42	t := NewWordPunctTokenizer()
43	fmt.Println(t.Tokenize("They'll save and invest more."))
44	// Output: [They ' ll save and invest more .]
45}
46
47func ExampleNewTreebankWordTokenizer() {
48	t := NewTreebankWordTokenizer()
49	fmt.Println(t.Tokenize("They'll save and invest more."))
50	// Output: [They 'll save and invest more .]
51}
52
53func ExampleNewBlanklineTokenizer() {
54	t := NewBlanklineTokenizer()
55	fmt.Println(t.Tokenize("They'll save and invest more.\n\nThanks!"))
56	// Output: [They'll save and invest more. Thanks!]
57}
58
59func TestTextToWords(t *testing.T) {
60	text := "Vale is a natural language linter that supports plain text, markup (Markdown, reStructuredText, AsciiDoc, and HTML), and source code comments. Vale doesn't attempt to offer a one-size-fits-all collection of rules—instead, it strives to make customization as easy as possible."
61	expected := []string{
62		"Vale", "is", "a", "natural", "language", "linter", "that", "supports",
63		"plain", "text", ",", "markup", "(", "Markdown", ",", "reStructuredText",
64		",", "AsciiDoc", ",", "and", "HTML", ")", ",", "and", "source", "code",
65		"comments", ".", "Vale", "does", "n't", "attempt", "to", "offer", "a",
66		"one-size-fits-all", "collection", "of", "rules—instead", ",", "it",
67		"strives", "to", "make", "customization", "as", "easy", "as", "possible",
68		"."}
69	assert.Equal(t, expected, TextToWords(text))
70}
71