1package tokenize
2
3import (
4	"encoding/json"
5	"fmt"
6	"path/filepath"
7	"testing"
8
9	"github.com/jdkato/prose/internal/util"
10)
11
// goldenRule is a single test case decoded from a golden_rules_<lang>.json
// fixture.
type goldenRule struct {
	// Name labels the case in failure logs; Input is the text handed to the
	// segmenter's Tokenize method.
	Name, Input string

	// Output holds the segments Tokenize is expected to return for Input,
	// compared element by element in order.
	Output []string
}
17
18func TestPragmaticRulesEn(t *testing.T) { testLang("en", t) }
19func TestPragmaticRulesFr(t *testing.T) { testLang("fr", t) }
20func TestPragmaticRulesEs(t *testing.T) { testLang("es", t) }
21
22func BenchmarkPragmaticRulesEn(b *testing.B) { benchmarkLang("en", b) }
23
24func benchmarkLang(lang string, b *testing.B) {
25	tests := make([]goldenRule, 0)
26	f := fmt.Sprintf("golden_rules_%s.json", lang)
27	cases := util.ReadDataFile(filepath.Join(testdata, f))
28
29	tok, err := NewPragmaticSegmenter(lang)
30	util.CheckError(err)
31
32	util.CheckError(json.Unmarshal(cases, &tests))
33	for n := 0; n < b.N; n++ {
34		for _, test := range tests {
35			tok.Tokenize(test.Input)
36		}
37	}
38}
39
40func testLang(lang string, t *testing.T) {
41	tests := make([]goldenRule, 0)
42	f := fmt.Sprintf("golden_rules_%s.json", lang)
43	cases := util.ReadDataFile(filepath.Join(testdata, f))
44
45	tok, err := NewPragmaticSegmenter(lang)
46	util.CheckError(err)
47
48	util.CheckError(json.Unmarshal(cases, &tests))
49	for _, test := range tests {
50		compare(t, test.Name, test.Input, test.Output, tok)
51	}
52}
53
54func compare(t *testing.T, test, actualText string, expected []string, tok *PragmaticSegmenter) bool {
55	actual := tok.Tokenize(actualText)
56	if len(actual) != len(expected) {
57		t.Log(test)
58		t.Logf("Actual: %v\n", actual)
59		t.Errorf("Actual: %d, Expected: %d\n", len(actual), len(expected))
60		t.Log("===")
61		return false
62	}
63	for index, sent := range actual {
64		if sent != expected[index] {
65			t.Log(test)
66			t.Errorf("Actual: [%s] Expected: [%s]\n", sent, expected[index])
67			t.Log("===")
68			return false
69		}
70	}
71	return true
72}
73